From 080c26edd64c149e1a3d212fbbf3e1c54b751516 Mon Sep 17 00:00:00 2001 From: Marat Saidov Date: Mon, 2 Jan 2023 12:06:43 +0700 Subject: [PATCH 1/3] refactoring for atd repo --- .dvc/plots/confusion.json | 107 --------- .dvc/plots/confusion_normalized.json | 112 ---------- .dvc/plots/linear.json | 116 ---------- .dvc/plots/scatter.json | 104 --------- .dvc/plots/simple.json | 31 --- .dvc/plots/smooth.json | 39 ---- .../data/README.md => docs/data.md | 0 .../README.md => docs/framework.md | 0 .../models/README.md => docs/models.md | 0 scripts/bertscore.py | 206 ------------------ scripts/load_bert_score.py | 6 - scripts/load_bleurt.py | 6 - scripts/load_comet.py | 6 - scripts/load_xlm_large.py | 7 - scripts/run_proxy.sh | 8 - 15 files changed, 748 deletions(-) delete mode 100644 .dvc/plots/confusion.json delete mode 100644 .dvc/plots/confusion_normalized.json delete mode 100644 .dvc/plots/linear.json delete mode 100644 .dvc/plots/scatter.json delete mode 100644 .dvc/plots/simple.json delete mode 100644 .dvc/plots/smooth.json rename artificial_detection/data/README.md => docs/data.md (100%) rename artificial_detection/README.md => docs/framework.md (100%) rename artificial_detection/models/README.md => docs/models.md (100%) delete mode 100644 scripts/bertscore.py delete mode 100644 scripts/load_bert_score.py delete mode 100644 scripts/load_bleurt.py delete mode 100644 scripts/load_comet.py delete mode 100644 scripts/load_xlm_large.py delete mode 100644 scripts/run_proxy.sh diff --git a/.dvc/plots/confusion.json b/.dvc/plots/confusion.json deleted file mode 100644 index 84ec022..0000000 --- a/.dvc/plots/confusion.json +++ /dev/null @@ -1,107 +0,0 @@ -{ - "$schema": "https://vega.github.io/schema/vega-lite/v5.json", - "data": { - "values": "" - }, - "title": "", - "facet": { - "field": "rev", - "type": "nominal" - }, - "spec": { - "transform": [ - { - "aggregate": [ - { - "op": "count", - "as": "xy_count" - } - ], - "groupby": [ - "", - "" - ] - }, - { - "impute": 
"xy_count", - "groupby": [ - "rev", - "" - ], - "key": "", - "value": 0 - }, - { - "impute": "xy_count", - "groupby": [ - "rev", - "" - ], - "key": "", - "value": 0 - }, - { - "joinaggregate": [ - { - "op": "max", - "field": "xy_count", - "as": "max_count" - } - ], - "groupby": [] - }, - { - "calculate": "datum.xy_count / datum.max_count", - "as": "percent_of_max" - } - ], - "encoding": { - "x": { - "field": "", - "type": "nominal", - "sort": "ascending", - "title": "" - }, - "y": { - "field": "", - "type": "nominal", - "sort": "ascending", - "title": "" - } - }, - "layer": [ - { - "mark": "rect", - "width": 300, - "height": 300, - "encoding": { - "color": { - "field": "xy_count", - "type": "quantitative", - "title": "", - "scale": { - "domainMin": 0, - "nice": true - } - } - } - }, - { - "mark": "text", - "encoding": { - "text": { - "field": "xy_count", - "type": "quantitative" - }, - "color": { - "condition": { - "test": "datum.percent_of_max > 0.5", - "value": "white" - }, - "value": "black" - } - } - } - ] - } -} diff --git a/.dvc/plots/confusion_normalized.json b/.dvc/plots/confusion_normalized.json deleted file mode 100644 index 92c7773..0000000 --- a/.dvc/plots/confusion_normalized.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "$schema": "https://vega.github.io/schema/vega-lite/v5.json", - "data": { - "values": "" - }, - "title": "", - "facet": { - "field": "rev", - "type": "nominal" - }, - "spec": { - "transform": [ - { - "aggregate": [ - { - "op": "count", - "as": "xy_count" - } - ], - "groupby": [ - "", - "" - ] - }, - { - "impute": "xy_count", - "groupby": [ - "rev", - "" - ], - "key": "", - "value": 0 - }, - { - "impute": "xy_count", - "groupby": [ - "rev", - "" - ], - "key": "", - "value": 0 - }, - { - "joinaggregate": [ - { - "op": "sum", - "field": "xy_count", - "as": "sum_y" - } - ], - "groupby": [ - "" - ] - }, - { - "calculate": "datum.xy_count / datum.sum_y", - "as": "percent_of_y" - } - ], - "encoding": { - "x": { - "field": "", - "type": "nominal", 
- "sort": "ascending", - "title": "" - }, - "y": { - "field": "", - "type": "nominal", - "sort": "ascending", - "title": "" - } - }, - "layer": [ - { - "mark": "rect", - "width": 300, - "height": 300, - "encoding": { - "color": { - "field": "percent_of_y", - "type": "quantitative", - "title": "", - "scale": { - "domain": [ - 0, - 1 - ] - } - } - } - }, - { - "mark": "text", - "encoding": { - "text": { - "field": "percent_of_y", - "type": "quantitative", - "format": ".2f" - }, - "color": { - "condition": { - "test": "datum.percent_of_y > 0.5", - "value": "white" - }, - "value": "black" - } - } - } - ] - } -} diff --git a/.dvc/plots/linear.json b/.dvc/plots/linear.json deleted file mode 100644 index 970dc92..0000000 --- a/.dvc/plots/linear.json +++ /dev/null @@ -1,116 +0,0 @@ -{ - "$schema": "https://vega.github.io/schema/vega-lite/v5.json", - "data": { - "values": "" - }, - "title": "", - "width": 300, - "height": 300, - "layer": [ - { - "encoding": { - "x": { - "field": "", - "type": "quantitative", - "title": "" - }, - "y": { - "field": "", - "type": "quantitative", - "title": "", - "scale": { - "zero": false - } - }, - "color": { - "field": "rev", - "type": "nominal" - } - }, - "layer": [ - { - "mark": "line" - }, - { - "selection": { - "label": { - "type": "single", - "nearest": true, - "on": "mouseover", - "encodings": [ - "x" - ], - "empty": "none", - "clear": "mouseout" - } - }, - "mark": "point", - "encoding": { - "opacity": { - "condition": { - "selection": "label", - "value": 1 - }, - "value": 0 - } - } - } - ] - }, - { - "transform": [ - { - "filter": { - "selection": "label" - } - } - ], - "layer": [ - { - "mark": { - "type": "rule", - "color": "gray" - }, - "encoding": { - "x": { - "field": "", - "type": "quantitative" - } - } - }, - { - "encoding": { - "text": { - "type": "quantitative", - "field": "" - }, - "x": { - "field": "", - "type": "quantitative" - }, - "y": { - "field": "", - "type": "quantitative" - } - }, - "layer": [ - { - "mark": { - 
"type": "text", - "align": "left", - "dx": 5, - "dy": -5 - }, - "encoding": { - "color": { - "type": "nominal", - "field": "rev" - } - } - } - ] - } - ] - } - ] -} diff --git a/.dvc/plots/scatter.json b/.dvc/plots/scatter.json deleted file mode 100644 index 6e8cf5b..0000000 --- a/.dvc/plots/scatter.json +++ /dev/null @@ -1,104 +0,0 @@ -{ - "$schema": "https://vega.github.io/schema/vega-lite/v5.json", - "data": { - "values": "" - }, - "title": "", - "width": 300, - "height": 300, - "layer": [ - { - "encoding": { - "x": { - "field": "", - "type": "quantitative", - "title": "" - }, - "y": { - "field": "", - "type": "quantitative", - "title": "", - "scale": { - "zero": false - } - }, - "color": { - "field": "rev", - "type": "nominal" - } - }, - "layer": [ - { - "mark": "point" - }, - { - "selection": { - "label": { - "type": "single", - "nearest": true, - "on": "mouseover", - "encodings": [ - "x" - ], - "empty": "none", - "clear": "mouseout" - } - }, - "mark": "point", - "encoding": { - "opacity": { - "condition": { - "selection": "label", - "value": 1 - }, - "value": 0 - } - } - } - ] - }, - { - "transform": [ - { - "filter": { - "selection": "label" - } - } - ], - "layer": [ - { - "encoding": { - "text": { - "type": "quantitative", - "field": "" - }, - "x": { - "field": "", - "type": "quantitative" - }, - "y": { - "field": "", - "type": "quantitative" - } - }, - "layer": [ - { - "mark": { - "type": "text", - "align": "left", - "dx": 5, - "dy": -5 - }, - "encoding": { - "color": { - "type": "nominal", - "field": "rev" - } - } - } - ] - } - ] - } - ] -} diff --git a/.dvc/plots/simple.json b/.dvc/plots/simple.json deleted file mode 100644 index 1cebce9..0000000 --- a/.dvc/plots/simple.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "$schema": "https://vega.github.io/schema/vega-lite/v5.json", - "data": { - "values": "" - }, - "title": "", - "width": 300, - "height": 300, - "mark": { - "type": "line" - }, - "encoding": { - "x": { - "field": "", - "type": "quantitative", - 
"title": "" - }, - "y": { - "field": "", - "type": "quantitative", - "title": "", - "scale": { - "zero": false - } - }, - "color": { - "field": "rev", - "type": "nominal" - } - } -} diff --git a/.dvc/plots/smooth.json b/.dvc/plots/smooth.json deleted file mode 100644 index 42b1ecf..0000000 --- a/.dvc/plots/smooth.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "$schema": "https://vega.github.io/schema/vega-lite/v5.json", - "data": { - "values": "" - }, - "title": "", - "mark": { - "type": "line" - }, - "encoding": { - "x": { - "field": "", - "type": "quantitative", - "title": "" - }, - "y": { - "field": "", - "type": "quantitative", - "title": "", - "scale": { - "zero": false - } - }, - "color": { - "field": "rev", - "type": "nominal" - } - }, - "transform": [ - { - "loess": "", - "on": "", - "groupby": [ - "rev" - ], - "bandwidth": 0.3 - } - ] -} diff --git a/artificial_detection/data/README.md b/docs/data.md similarity index 100% rename from artificial_detection/data/README.md rename to docs/data.md diff --git a/artificial_detection/README.md b/docs/framework.md similarity index 100% rename from artificial_detection/README.md rename to docs/framework.md diff --git a/artificial_detection/models/README.md b/docs/models.md similarity index 100% rename from artificial_detection/models/README.md rename to docs/models.md diff --git a/scripts/bertscore.py b/scripts/bertscore.py deleted file mode 100644 index 89229e4..0000000 --- a/scripts/bertscore.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright 2020 The HuggingFace Datasets Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -""" BERTScore metric. """ - -import functools -from contextlib import contextmanager - -import bert_score -import datasets -from packaging import version - - -@contextmanager -def filter_logging_context(): - def filter_log(record): - return False if "This IS expected if you are initializing" in record.msg else True - - logger = datasets.utils.logging.get_logger("transformers.modeling_utils") - logger.addFilter(filter_log) - try: - yield - finally: - logger.removeFilter(filter_log) - - -_CITATION = """\ -@inproceedings{bert-score, - title={BERTScore: Evaluating Text Generation with BERT}, - author={Tianyi Zhang* and Varsha Kishore* and Felix Wu* and Kilian Q. Weinberger and Yoav Artzi}, - booktitle={International Conference on Learning Representations}, - year={2020}, - url={https://openreview.net/forum?id=SkeHuCVFDr} -} -""" - -_DESCRIPTION = """\ -BERTScore leverages the pre-trained contextual embeddings from BERT and matches words in candidate and reference -sentences by cosine similarity. -It has been shown to correlate with human judgment on sentence-level and system-level evaluation. -Moreover, BERTScore computes precision, recall, and F1 measure, which can be useful for evaluating different language -generation tasks. - -See the project's README at https://github.com/Tiiiger/bert_score#readme for more information. -""" - -_KWARGS_DESCRIPTION = """ -BERTScore Metrics with the hashcode from a source against one or more references. - -Args: - predictions (list of str): Prediction/candidate sentences. - references (list of str or list of list of str): Reference sentences. - lang (str): Language of the sentences; required (e.g. 'en'). - model_type (str): Bert specification, default using the suggested - model for the target language; has to specify at least one of - `model_type` or `lang`. 
- num_layers (int): The layer of representation to use, - default using the number of layers tuned on WMT16 correlation data. - verbose (bool): Turn on intermediate status update. - idf (bool or dict): Use idf weighting; can also be a precomputed idf_dict. - device (str): On which the contextual embedding model will be allocated on. - If this argument is None, the model lives on cuda:0 if cuda is available. - nthreads (int): Number of threads. - batch_size (int): Bert score processing batch size, - at least one of `model_type` or `lang`. `lang` needs to be - specified when `rescale_with_baseline` is True. - rescale_with_baseline (bool): Rescale bertscore with pre-computed baseline. - baseline_path (str): Customized baseline file. - use_fast_tokenizer (bool): `use_fast` parameter passed to HF tokenizer. New in version 0.3.10. - -Returns: - precision: Precision. - recall: Recall. - f1: F1 score. - hashcode: Hashcode of the library. - -Examples: - - >>> predictions = ["hello there", "general kenobi"] - >>> references = ["hello there", "general kenobi"] - >>> bertscore = datasets.load_metric("bertscore") - >>> results = bertscore.compute(predictions=predictions, references=references, lang="en") - >>> print([round(v, 2) for v in results["f1"]]) - [1.0, 1.0] -""" - - -@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) -class BERTScore(datasets.Metric): - def _info(self): - return datasets.MetricInfo( - description=_DESCRIPTION, - citation=_CITATION, - homepage="https://github.com/Tiiiger/bert_score", - inputs_description=_KWARGS_DESCRIPTION, - features=datasets.Features( - { - "predictions": datasets.Value("string", id="sequence"), - "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"), - } - ), - codebase_urls=["https://github.com/Tiiiger/bert_score"], - reference_urls=[ - "https://github.com/Tiiiger/bert_score", - "https://arxiv.org/abs/1904.09675", - ], - ) - - def _compute( - self, - predictions, - 
references, - lang=None, - model_type=None, - num_layers=None, - verbose=False, - idf=False, - device=None, - batch_size=64, - nthreads=4, - all_layers=False, - rescale_with_baseline=False, - baseline_path=None, - use_fast_tokenizer=False, - ): - get_hash = bert_score.utils.get_hash - scorer = bert_score.BERTScorer - - if version.parse(bert_score.__version__) >= version.parse("0.3.10"): - get_hash = functools.partial(get_hash, use_fast_tokenizer=use_fast_tokenizer) - scorer = functools.partial(scorer, use_fast_tokenizer=use_fast_tokenizer) - elif use_fast_tokenizer: - raise ImportWarning( - "To use a fast tokenizer, the module `bert-score>=0.3.10` is required, and the current version of `bert-score` doesn't match this condition.\n" - 'You can install it with `pip install "bert-score>=0.3.10"`.' - ) - - if model_type is None: - assert lang is not None, "either lang or model_type should be specified" - model_type = bert_score.utils.lang2model[lang.lower()] - - if num_layers is None: - num_layers = bert_score.utils.model2layers[model_type] - - hashcode = get_hash( - model=model_type, - num_layers=num_layers, - idf=idf, - rescale_with_baseline=rescale_with_baseline, - use_custom_baseline=baseline_path is not None, - ) - - with filter_logging_context(): - if not hasattr(self, "cached_bertscorer") or self.cached_bertscorer.hash != hashcode: - self.cached_bertscorer = scorer( - model_type=model_type, - num_layers=num_layers, - batch_size=batch_size, - nthreads=nthreads, - all_layers=all_layers, - idf=idf, - device=device, - lang=lang, - rescale_with_baseline=rescale_with_baseline, - baseline_path=baseline_path, - ) - - (P, R, F) = self.cached_bertscorer.score( - cands=predictions, - refs=references, - verbose=verbose, - batch_size=batch_size, - ) - output_dict = { - "precision": P.tolist(), - "recall": R.tolist(), - "f1": F.tolist(), - "hashcode": hashcode, - } - return output_dict - - def add_batch(self, predictions=None, references=None, **kwargs): - """Add a batch of 
predictions and references for the metric's stack.""" - # References can be strings or lists of strings - # Let's change strings to lists of strings with one element - if references is not None: - references = [[ref] if isinstance(ref, str) else ref for ref in references] - super().add_batch(predictions=predictions, references=references, **kwargs) - - def add(self, prediction=None, reference=None, **kwargs): - """Add one prediction and reference for the metric's stack.""" - # References can be strings or lists of strings - # Let's change strings to lists of strings with one element - if isinstance(reference, str): - reference = [reference] - super().add(prediction=prediction, reference=reference, **kwargs) diff --git a/scripts/load_bert_score.py b/scripts/load_bert_score.py deleted file mode 100644 index b186ddf..0000000 --- a/scripts/load_bert_score.py +++ /dev/null @@ -1,6 +0,0 @@ -from datasets import load_metric - -bert_score_metric = load_metric("bertscore") -print(bert_score_metric) - -# $HOME/.cache/huggingface/metrics/bert_score diff --git a/scripts/load_bleurt.py b/scripts/load_bleurt.py deleted file mode 100644 index 1162c48..0000000 --- a/scripts/load_bleurt.py +++ /dev/null @@ -1,6 +0,0 @@ -from datasets import load_metric - -bleurt = load_metric("bleurt", "BLEURT-20") - -print(bleurt) -# $HOME/.cache/huggingface/metrics/bleurt/BLEURT-20 diff --git a/scripts/load_comet.py b/scripts/load_comet.py deleted file mode 100644 index 988e811..0000000 --- a/scripts/load_comet.py +++ /dev/null @@ -1,6 +0,0 @@ -from artificial_detection.data.proxy import CometMetrics - -comet_model, model_path = CometMetrics.load_offline() - -print("model_path:", model_path) -# $HOME/.cache/torch/unbabel_comet/wmt20-comet-da/checkpoints/model.ckpt diff --git a/scripts/load_xlm_large.py b/scripts/load_xlm_large.py deleted file mode 100644 index a180fd5..0000000 --- a/scripts/load_xlm_large.py +++ /dev/null @@ -1,7 +0,0 @@ -from transformers import XLMRobertaModel, 
XLMRobertaTokenizer - -tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large") -model = XLMRobertaModel.from_pretrained( - "xlm-roberta-large", add_pooling_layer=False -) -# $HOME/atd-models/xlm-roberta-large diff --git a/scripts/run_proxy.sh b/scripts/run_proxy.sh deleted file mode 100644 index 45acd1c..0000000 --- a/scripts/run_proxy.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -python artificial_detection/data/proxy.py \ - --df_path="$HOME/atd-data/metrics_checkpoint_merged_df.tsv" \ - --metrics_names="$1" \ - --model_path="$2" \ - --baseline_path="$3" \ - --output_path="$HOME/atd-data/collected_metrics_$1.tsv" From 56656385ad2cf99bb9705d48e885efb201add213 Mon Sep 17 00:00:00 2001 From: Marat Saidov Date: Mon, 2 Jan 2023 14:03:37 +0700 Subject: [PATCH 2/3] docs refactoring and todos for bad snippets --- README.md | 51 +++++++------------- artificial_detection/pipelines/supervised.py | 1 + docs/data.md | 33 +++++++++++++ tests/test_translate.py | 6 --- 4 files changed, 52 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index ae84d33..8a54d4d 100644 --- a/README.md +++ b/README.md @@ -23,58 +23,43 @@ NLP approaches to compare natural text against generated by neural networks. ### Contents -Project description is put into: +This work has the extensive documentation which is divided into following parts: -- [Framework Description Markdown](https://github.com/MaratSaidov/artificial-text-detection/blob/main/detection/README.md) -- [Data Description Markdown](https://github.com/MaratSaidov/artificial-text-detection/blob/main/detection/data/README.md) -- [Models Description Markdown](https://github.com/MaratSaidov/artificial-text-detection/blob/main/detection/data/README.md) +- [ATD framework](docs/framework.md). This section is about the general way to detect artificial texts. + It consists of the overview of end-to-end pipelines list for this task. +- [Data](docs/data.md). Textual domains were aggregated in the specific way. 
+ This section describes the generative part of the task. +- [Discriminator models](docs/models.md). This section describes the discriminative part of the task. ### Installation steps: -We use [`poetry`](https://python-poetry.org/) as an enhanced dependency resolver. - +We use [`poetry`](https://python-poetry.org/) as a dependency resolver. ```bash make poetry-download poetry install --no-dev ``` -### Datasets for artificial text detection - -To create datasets for the further classification, it is necessary to collect them. -There are 2 available ways for it: - -- Via [Data Version Control](https://dvc.org/). -Get in touch with [`@msaidov`](https://t.me/msaidov) in order to have the access to the private Google Drive; -- Via datasets generation. One dataset with a size of 20,000 samples was process with MT model on V100 GPU for 30 mins; - -### Data Version Control usage: - -```bash -poetry add "dvc[gdrive]" -``` - -Then, run `dvc pull`. It will download preprocessed translation datasets -from the Google Drive. - -### Datasets generation - -To generate translations before artificial text detection pipeline, -install the `detection` module from the cloned repo or PyPi (TODO): -```bash -pip install -e . -``` -Then, run generate script: +Make sure that `artificial_detection` library is pre-configured in your environment: ```bash -python detection/data/generate.py --dataset_name='tatoeba' --size=20000 --device='cuda:0' +pip show artificial-detection ``` ### Simple run: To run the artificial text detection classifier, execute the pipeline: +# TODO: improve runners ```bash python detection/old.py ``` + +### DVC for prototypes + +[DVC](https://dvc.org/) is used to pull and update small- and medium-sized samples into remote storage. +Google Drive has been chosen as a remote storage. +Credentials are hidden. Reach out to the maintainer for contributions and prototyping support.
+ + [build_status_badge]: https://github.com/MaratSaidov/artificial-text-detection/actions/workflows/build.yml/badge.svg [build_status_link]: https://github.com/MaratSaidov/artificial-text-detection/actions/workflows/build.yml diff --git a/artificial_detection/pipelines/supervised.py b/artificial_detection/pipelines/supervised.py index 12944cf..e616aa6 100644 --- a/artificial_detection/pipelines/supervised.py +++ b/artificial_detection/pipelines/supervised.py @@ -44,6 +44,7 @@ def read_splits(df, as_datasets): def prepare_data(tokenizer): + # TODO: get rid of prefixes which include my root data_path = "/home/masaidov/atd-data/metrics_df.tsv" df = pd.read_csv(data_path, sep="\t") df = df[["text", "label", "subset"]] diff --git a/docs/data.md b/docs/data.md index d51771d..e8128a7 100644 --- a/docs/data.md +++ b/docs/data.md @@ -31,3 +31,36 @@ dvc push ``` Do not forget to commit `data.dvc` file in order to pull processed datasets later. + + +# TODO: refactor later on + +### Datasets for artificial text detection + +To create datasets for the further classification, it is necessary to collect them. +There are 2 available ways for it: + +- Via [Data Version Control](https://dvc.org/). +Get in touch with [`@msaidov`](https://t.me/msaidov) in order to have the access to the private Google Drive; +- Via datasets generation. One dataset with a size of 20,000 samples was processed with an MT model on a V100 GPU in 30 mins; + +### Data Version Control usage: + +```bash +poetry add "dvc[gdrive]" +``` + +Then, run `dvc pull`. It will download preprocessed translation datasets +from the Google Drive. + +### Datasets generation + +To generate translations before artificial text detection pipeline, +install the `detection` module from the cloned repo or PyPi (TODO): +```bash +pip install -e .
+``` +Then, run generate script: +```bash +python detection/data/generate.py --dataset_name='tatoeba' --size=20000 --device='cuda:0' +``` diff --git a/tests/test_translate.py b/tests/test_translate.py index de5c87f..dd2f7d0 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -6,7 +6,6 @@ from artificial_detection.models.translation import TranslationModel from artificial_detection.utils import get_dataset_path, save_translations_texts -from tests import skip_github class TestTranslate(TestCase): @@ -55,8 +54,3 @@ def test_save_to_csv(self) -> None: assert_that(df_sample.columns.tolist(), has_items(*["sources", "targets", "translations"])) assert_that(len(df_sample), equal_to(5)) os.remove(dataset_path) - - @skip_github - def test_gpu_usage(self) -> None: - # TODO - pass From a36e58e442bc9d2c8f80b568ea3154ee37fde87e Mon Sep 17 00:00:00 2001 From: Marat Saidov Date: Mon, 2 Jan 2023 20:56:45 +0700 Subject: [PATCH 3/3] templated docs --- docs/data.md | 22 ++++++++++++++++++++-- docs/framework.md | 16 +++------------- docs/models.md | 4 +++- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/docs/data.md b/docs/data.md index e8128a7..2b72087 100644 --- a/docs/data.md +++ b/docs/data.md @@ -1,6 +1,8 @@ -# Data generation description +# Data + +A part of the documentation related to datasets and a generative part. + -Some additional information about datasets. ### Data naming rules: @@ -64,3 +66,19 @@ Then, run generate script: ```bash python detection/data/generate.py --dataset_name='tatoeba' --size=20000 --device='cuda:0' ``` + +TODO: refactoring + +The purpose of the Detection framework is to generalize artificial text detection approaches. + +At the current stage this framework provides the support of: + +- Datasets from [tatoeba](https://huggingface.co/datasets/tatoeba), +[WikiMatrix](https://github.com/facebookresearch/LASER/tree/main/tasks/WikiMatrix); +- Models from [EasyNMT](https://github.com/UKPLab/EasyNMT). 
+ +Here is a list of supported languages: + +| Source language | Target Language | Dataset | +| :---: | :---: | :---: | +| Russian | English | Tatoeba | diff --git a/docs/framework.md b/docs/framework.md index 07f49de..5ffa560 100644 --- a/docs/framework.md +++ b/docs/framework.md @@ -1,15 +1,5 @@ -# Detection framework +# Artificial Text Detection: Pipelines and General Framework -The purpose of the Detection framework is to generalize artificial text detection approaches. +Here we describe the pipeline structure and how we dealt with multiple domains. -At the current stage this framework provides the support of: - -- Datasets from [tatoeba](https://huggingface.co/datasets/tatoeba), -[WikiMatrix](https://github.com/facebookresearch/LASER/tree/main/tasks/WikiMatrix); -- Models from [EasyNMT](https://github.com/UKPLab/EasyNMT). - -Here is a list of supported languages: - -| Source language | Target Language | Dataset | -| :---: | :---: | :---: | -| Russian | English | Tatoeba | +TODO diff --git a/docs/models.md b/docs/models.md index 19d9ca6..8d0a721 100644 --- a/docs/models.md +++ b/docs/models.md @@ -1,4 +1,6 @@ -# Models description +# Discriminative modeling + +TODO: we train discriminators... Primarly, we use a wrapper on machine translation models from [`EasyNMT`](https://github.com/UKPLab/EasyNMT).