From 080c26edd64c149e1a3d212fbbf3e1c54b751516 Mon Sep 17 00:00:00 2001
From: Marat Saidov <msaidov1@yandex.ru>
Date: Mon, 2 Jan 2023 12:06:43 +0700
Subject: [PATCH 1/3] refactoring for atd repo

---
 .dvc/plots/confusion.json                     | 107 ---------
 .dvc/plots/confusion_normalized.json          | 112 ----------
 .dvc/plots/linear.json                        | 116 ----------
 .dvc/plots/scatter.json                       | 104 ---------
 .dvc/plots/simple.json                        |  31 ---
 .dvc/plots/smooth.json                        |  39 ----
 .../data/README.md => docs/data.md            |   0
 .../README.md => docs/framework.md            |   0
 .../models/README.md => docs/models.md        |   0
 scripts/bertscore.py                          | 206 ------------------
 scripts/load_bert_score.py                    |   6 -
 scripts/load_bleurt.py                        |   6 -
 scripts/load_comet.py                         |   6 -
 scripts/load_xlm_large.py                     |   7 -
 scripts/run_proxy.sh                          |   8 -
 15 files changed, 748 deletions(-)
 delete mode 100644 .dvc/plots/confusion.json
 delete mode 100644 .dvc/plots/confusion_normalized.json
 delete mode 100644 .dvc/plots/linear.json
 delete mode 100644 .dvc/plots/scatter.json
 delete mode 100644 .dvc/plots/simple.json
 delete mode 100644 .dvc/plots/smooth.json
 rename artificial_detection/data/README.md => docs/data.md (100%)
 rename artificial_detection/README.md => docs/framework.md (100%)
 rename artificial_detection/models/README.md => docs/models.md (100%)
 delete mode 100644 scripts/bertscore.py
 delete mode 100644 scripts/load_bert_score.py
 delete mode 100644 scripts/load_bleurt.py
 delete mode 100644 scripts/load_comet.py
 delete mode 100644 scripts/load_xlm_large.py
 delete mode 100644 scripts/run_proxy.sh

diff --git a/.dvc/plots/confusion.json b/.dvc/plots/confusion.json
deleted file mode 100644
index 84ec022..0000000
--- a/.dvc/plots/confusion.json
+++ /dev/null
@@ -1,107 +0,0 @@
-{
-    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
-    "data": {
-        "values": "<DVC_METRIC_DATA>"
-    },
-    "title": "<DVC_METRIC_TITLE>",
-    "facet": {
-        "field": "rev",
-        "type": "nominal"
-    },
-    "spec": {
-        "transform": [
-            {
-                "aggregate": [
-                    {
-                        "op": "count",
-                        "as": "xy_count"
-                    }
-                ],
-                "groupby": [
-                    "<DVC_METRIC_Y>",
-                    "<DVC_METRIC_X>"
-                ]
-            },
-            {
-                "impute": "xy_count",
-                "groupby": [
-                    "rev",
-                    "<DVC_METRIC_Y>"
-                ],
-                "key": "<DVC_METRIC_X>",
-                "value": 0
-            },
-            {
-                "impute": "xy_count",
-                "groupby": [
-                    "rev",
-                    "<DVC_METRIC_X>"
-                ],
-                "key": "<DVC_METRIC_Y>",
-                "value": 0
-            },
-            {
-                "joinaggregate": [
-                    {
-                        "op": "max",
-                        "field": "xy_count",
-                        "as": "max_count"
-                    }
-                ],
-                "groupby": []
-            },
-            {
-                "calculate": "datum.xy_count / datum.max_count",
-                "as": "percent_of_max"
-            }
-        ],
-        "encoding": {
-            "x": {
-                "field": "<DVC_METRIC_X>",
-                "type": "nominal",
-                "sort": "ascending",
-                "title": "<DVC_METRIC_X_LABEL>"
-            },
-            "y": {
-                "field": "<DVC_METRIC_Y>",
-                "type": "nominal",
-                "sort": "ascending",
-                "title": "<DVC_METRIC_Y_LABEL>"
-            }
-        },
-        "layer": [
-            {
-                "mark": "rect",
-                "width": 300,
-                "height": 300,
-                "encoding": {
-                    "color": {
-                        "field": "xy_count",
-                        "type": "quantitative",
-                        "title": "",
-                        "scale": {
-                            "domainMin": 0,
-                            "nice": true
-                        }
-                    }
-                }
-            },
-            {
-                "mark": "text",
-                "encoding": {
-                    "text": {
-                        "field": "xy_count",
-                        "type": "quantitative"
-                    },
-                    "color": {
-                        "condition": {
-                            "test": "datum.percent_of_max > 0.5",
-                            "value": "white"
-                        },
-                        "value": "black"
-                    }
-                }
-            }
-        ]
-    }
-}
diff --git a/.dvc/plots/confusion_normalized.json b/.dvc/plots/confusion_normalized.json
deleted file mode 100644
index 92c7773..0000000
--- a/.dvc/plots/confusion_normalized.json
+++ /dev/null
@@ -1,112 +0,0 @@
-{
-    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
-    "data": {
-        "values": "<DVC_METRIC_DATA>"
-    },
-    "title": "<DVC_METRIC_TITLE>",
-    "facet": {
-        "field": "rev",
-        "type": "nominal"
-    },
-    "spec": {
-        "transform": [
-            {
-                "aggregate": [
-                    {
-                        "op": "count",
-                        "as": "xy_count"
-                    }
-                ],
-                "groupby": [
-                    "<DVC_METRIC_Y>",
-                    "<DVC_METRIC_X>"
-                ]
-            },
-            {
-                "impute": "xy_count",
-                "groupby": [
-                    "rev",
-                    "<DVC_METRIC_Y>"
-                ],
-                "key": "<DVC_METRIC_X>",
-                "value": 0
-            },
-            {
-                "impute": "xy_count",
-                "groupby": [
-                    "rev",
-                    "<DVC_METRIC_X>"
-                ],
-                "key": "<DVC_METRIC_Y>",
-                "value": 0
-            },
-            {
-                "joinaggregate": [
-                    {
-                        "op": "sum",
-                        "field": "xy_count",
-                        "as": "sum_y"
-                    }
-                ],
-                "groupby": [
-                    "<DVC_METRIC_Y>"
-                ]
-            },
-            {
-                "calculate": "datum.xy_count / datum.sum_y",
-                "as": "percent_of_y"
-            }
-        ],
-        "encoding": {
-            "x": {
-                "field": "<DVC_METRIC_X>",
-                "type": "nominal",
-                "sort": "ascending",
-                "title": "<DVC_METRIC_X_LABEL>"
-            },
-            "y": {
-                "field": "<DVC_METRIC_Y>",
-                "type": "nominal",
-                "sort": "ascending",
-                "title": "<DVC_METRIC_Y_LABEL>"
-            }
-        },
-        "layer": [
-            {
-                "mark": "rect",
-                "width": 300,
-                "height": 300,
-                "encoding": {
-                    "color": {
-                        "field": "percent_of_y",
-                        "type": "quantitative",
-                        "title": "",
-                        "scale": {
-                            "domain": [
-                                0,
-                                1
-                            ]
-                        }
-                    }
-                }
-            },
-            {
-                "mark": "text",
-                "encoding": {
-                    "text": {
-                        "field": "percent_of_y",
-                        "type": "quantitative",
-                        "format": ".2f"
-                    },
-                    "color": {
-                        "condition": {
-                            "test": "datum.percent_of_y > 0.5",
-                            "value": "white"
-                        },
-                        "value": "black"
-                    }
-                }
-            }
-        ]
-    }
-}
diff --git a/.dvc/plots/linear.json b/.dvc/plots/linear.json
deleted file mode 100644
index 970dc92..0000000
--- a/.dvc/plots/linear.json
+++ /dev/null
@@ -1,116 +0,0 @@
-{
-    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
-    "data": {
-        "values": "<DVC_METRIC_DATA>"
-    },
-    "title": "<DVC_METRIC_TITLE>",
-    "width": 300,
-    "height": 300,
-    "layer": [
-        {
-            "encoding": {
-                "x": {
-                    "field": "<DVC_METRIC_X>",
-                    "type": "quantitative",
-                    "title": "<DVC_METRIC_X_LABEL>"
-                },
-                "y": {
-                    "field": "<DVC_METRIC_Y>",
-                    "type": "quantitative",
-                    "title": "<DVC_METRIC_Y_LABEL>",
-                    "scale": {
-                        "zero": false
-                    }
-                },
-                "color": {
-                    "field": "rev",
-                    "type": "nominal"
-                }
-            },
-            "layer": [
-                {
-                    "mark": "line"
-                },
-                {
-                    "selection": {
-                        "label": {
-                            "type": "single",
-                            "nearest": true,
-                            "on": "mouseover",
-                            "encodings": [
-                                "x"
-                            ],
-                            "empty": "none",
-                            "clear": "mouseout"
-                        }
-                    },
-                    "mark": "point",
-                    "encoding": {
-                        "opacity": {
-                            "condition": {
-                                "selection": "label",
-                                "value": 1
-                            },
-                            "value": 0
-                        }
-                    }
-                }
-            ]
-        },
-        {
-            "transform": [
-                {
-                    "filter": {
-                        "selection": "label"
-                    }
-                }
-            ],
-            "layer": [
-                {
-                    "mark": {
-                        "type": "rule",
-                        "color": "gray"
-                    },
-                    "encoding": {
-                        "x": {
-                            "field": "<DVC_METRIC_X>",
-                            "type": "quantitative"
-                        }
-                    }
-                },
-                {
-                    "encoding": {
-                        "text": {
-                            "type": "quantitative",
-                            "field": "<DVC_METRIC_Y>"
-                        },
-                        "x": {
-                            "field": "<DVC_METRIC_X>",
-                            "type": "quantitative"
-                        },
-                        "y": {
-                            "field": "<DVC_METRIC_Y>",
-                            "type": "quantitative"
-                        }
-                    },
-                    "layer": [
-                        {
-                            "mark": {
-                                "type": "text",
-                                "align": "left",
-                                "dx": 5,
-                                "dy": -5
-                            },
-                            "encoding": {
-                                "color": {
-                                    "type": "nominal",
-                                    "field": "rev"
-                                }
-                            }
-                        }
-                    ]
-                }
-            ]
-        }
-    ]
-}
diff --git a/.dvc/plots/scatter.json b/.dvc/plots/scatter.json
deleted file mode 100644
index 6e8cf5b..0000000
--- a/.dvc/plots/scatter.json
+++ /dev/null
@@ -1,104 +0,0 @@
-{
-    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
-    "data": {
-        "values": "<DVC_METRIC_DATA>"
-    },
-    "title": "<DVC_METRIC_TITLE>",
-    "width": 300,
-    "height": 300,
-    "layer": [
-        {
-            "encoding": {
-                "x": {
-                    "field": "<DVC_METRIC_X>",
-                    "type": "quantitative",
-                    "title": "<DVC_METRIC_X_LABEL>"
-                },
-                "y": {
-                    "field": "<DVC_METRIC_Y>",
-                    "type": "quantitative",
-                    "title": "<DVC_METRIC_Y_LABEL>",
-                    "scale": {
-                        "zero": false
-                    }
-                },
-                "color": {
-                    "field": "rev",
-                    "type": "nominal"
-                }
-            },
-            "layer": [
-                {
-                    "mark": "point"
-                },
-                {
-                    "selection": {
-                        "label": {
-                            "type": "single",
-                            "nearest": true,
-                            "on": "mouseover",
-                            "encodings": [
-                                "x"
-                            ],
-                            "empty": "none",
-                            "clear": "mouseout"
-                        }
-                    },
-                    "mark": "point",
-                    "encoding": {
-                        "opacity": {
-                            "condition": {
-                                "selection": "label",
-                                "value": 1
-                            },
-                            "value": 0
-                        }
-                    }
-                }
-            ]
-        },
-        {
-            "transform": [
-                {
-                    "filter": {
-                        "selection": "label"
-                    }
-                }
-            ],
-            "layer": [
-                {
-                    "encoding": {
-                        "text": {
-                            "type": "quantitative",
-                            "field": "<DVC_METRIC_Y>"
-                        },
-                        "x": {
-                            "field": "<DVC_METRIC_X>",
-                            "type": "quantitative"
-                        },
-                        "y": {
-                            "field": "<DVC_METRIC_Y>",
-                            "type": "quantitative"
-                        }
-                    },
-                    "layer": [
-                        {
-                            "mark": {
-                                "type": "text",
-                                "align": "left",
-                                "dx": 5,
-                                "dy": -5
-                            },
-                            "encoding": {
-                                "color": {
-                                    "type": "nominal",
-                                    "field": "rev"
-                                }
-                            }
-                        }
-                    ]
-                }
-            ]
-        }
-    ]
-}
diff --git a/.dvc/plots/simple.json b/.dvc/plots/simple.json
deleted file mode 100644
index 1cebce9..0000000
--- a/.dvc/plots/simple.json
+++ /dev/null
@@ -1,31 +0,0 @@
-{
-    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
-    "data": {
-        "values": "<DVC_METRIC_DATA>"
-    },
-    "title": "<DVC_METRIC_TITLE>",
-    "width": 300,
-    "height": 300,
-    "mark": {
-        "type": "line"
-    },
-    "encoding": {
-        "x": {
-            "field": "<DVC_METRIC_X>",
-            "type": "quantitative",
-            "title": "<DVC_METRIC_X_LABEL>"
-        },
-        "y": {
-            "field": "<DVC_METRIC_Y>",
-            "type": "quantitative",
-            "title": "<DVC_METRIC_Y_LABEL>",
-            "scale": {
-                "zero": false
-            }
-        },
-        "color": {
-            "field": "rev",
-            "type": "nominal"
-        }
-    }
-}
diff --git a/.dvc/plots/smooth.json b/.dvc/plots/smooth.json
deleted file mode 100644
index 42b1ecf..0000000
--- a/.dvc/plots/smooth.json
+++ /dev/null
@@ -1,39 +0,0 @@
-{
-    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
-    "data": {
-        "values": "<DVC_METRIC_DATA>"
-    },
-    "title": "<DVC_METRIC_TITLE>",
-    "mark": {
-        "type": "line"
-    },
-    "encoding": {
-        "x": {
-            "field": "<DVC_METRIC_X>",
-            "type": "quantitative",
-            "title": "<DVC_METRIC_X_LABEL>"
-        },
-        "y": {
-            "field": "<DVC_METRIC_Y>",
-            "type": "quantitative",
-            "title": "<DVC_METRIC_Y_LABEL>",
-            "scale": {
-                "zero": false
-            }
-        },
-        "color": {
-            "field": "rev",
-            "type": "nominal"
-        }
-    },
-    "transform": [
-        {
-            "loess": "<DVC_METRIC_Y>",
-            "on": "<DVC_METRIC_X>",
-            "groupby": [
-                "rev"
-            ],
-            "bandwidth": 0.3
-        }
-    ]
-}
diff --git a/artificial_detection/data/README.md b/docs/data.md
similarity index 100%
rename from artificial_detection/data/README.md
rename to docs/data.md
diff --git a/artificial_detection/README.md b/docs/framework.md
similarity index 100%
rename from artificial_detection/README.md
rename to docs/framework.md
diff --git a/artificial_detection/models/README.md b/docs/models.md
similarity index 100%
rename from artificial_detection/models/README.md
rename to docs/models.md
diff --git a/scripts/bertscore.py b/scripts/bertscore.py
deleted file mode 100644
index 89229e4..0000000
--- a/scripts/bertscore.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# Copyright 2020 The HuggingFace Datasets Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" BERTScore metric. """
-
-import functools
-from contextlib import contextmanager
-
-import bert_score
-import datasets
-from packaging import version
-
-
-@contextmanager
-def filter_logging_context():
-    def filter_log(record):
-        return False if "This IS expected if you are initializing" in record.msg else True
-
-    logger = datasets.utils.logging.get_logger("transformers.modeling_utils")
-    logger.addFilter(filter_log)
-    try:
-        yield
-    finally:
-        logger.removeFilter(filter_log)
-
-
-_CITATION = """\
-@inproceedings{bert-score,
-  title={BERTScore: Evaluating Text Generation with BERT},
-  author={Tianyi Zhang* and Varsha Kishore* and Felix Wu* and Kilian Q. Weinberger and Yoav Artzi},
-  booktitle={International Conference on Learning Representations},
-  year={2020},
-  url={https://openreview.net/forum?id=SkeHuCVFDr}
-}
-"""
-
-_DESCRIPTION = """\
-BERTScore leverages the pre-trained contextual embeddings from BERT and matches words in candidate and reference
-sentences by cosine similarity.
-It has been shown to correlate with human judgment on sentence-level and system-level evaluation.
-Moreover, BERTScore computes precision, recall, and F1 measure, which can be useful for evaluating different language
-generation tasks.
-
-See the project's README at https://github.com/Tiiiger/bert_score#readme for more information.
-"""
-
-_KWARGS_DESCRIPTION = """
-BERTScore Metrics with the hashcode from a source against one or more references.
-
-Args:
-    predictions (list of str): Prediction/candidate sentences.
-    references (list of str or list of list of str): Reference sentences.
-    lang (str): Language of the sentences; required (e.g. 'en').
-    model_type (str): Bert specification, default using the suggested
-        model for the target language; has to specify at least one of
-        `model_type` or `lang`.
-    num_layers (int): The layer of representation to use,
-        default using the number of layers tuned on WMT16 correlation data.
-    verbose (bool): Turn on intermediate status update.
-    idf (bool or dict): Use idf weighting; can also be a precomputed idf_dict.
-    device (str): On which the contextual embedding model will be allocated on.
-        If this argument is None, the model lives on cuda:0 if cuda is available.
-    nthreads (int): Number of threads.
-    batch_size (int): Bert score processing batch size,
-        at least one of `model_type` or `lang`. `lang` needs to be
-        specified when `rescale_with_baseline` is True.
-    rescale_with_baseline (bool): Rescale bertscore with pre-computed baseline.
-    baseline_path (str): Customized baseline file.
-    use_fast_tokenizer (bool): `use_fast` parameter passed to HF tokenizer. New in version 0.3.10.
-
-Returns:
-    precision: Precision.
-    recall: Recall.
-    f1: F1 score.
-    hashcode: Hashcode of the library.
-
-Examples:
-
-    >>> predictions = ["hello there", "general kenobi"]
-    >>> references = ["hello there", "general kenobi"]
-    >>> bertscore = datasets.load_metric("bertscore")
-    >>> results = bertscore.compute(predictions=predictions, references=references, lang="en")
-    >>> print([round(v, 2) for v in results["f1"]])
-    [1.0, 1.0]
-"""
-
-
-@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
-class BERTScore(datasets.Metric):
-    def _info(self):
-        return datasets.MetricInfo(
-            description=_DESCRIPTION,
-            citation=_CITATION,
-            homepage="https://github.com/Tiiiger/bert_score",
-            inputs_description=_KWARGS_DESCRIPTION,
-            features=datasets.Features(
-                {
-                    "predictions": datasets.Value("string", id="sequence"),
-                    "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
-                }
-            ),
-            codebase_urls=["https://github.com/Tiiiger/bert_score"],
-            reference_urls=[
-                "https://github.com/Tiiiger/bert_score",
-                "https://arxiv.org/abs/1904.09675",
-            ],
-        )
-
-    def _compute(
-        self,
-        predictions,
-        references,
-        lang=None,
-        model_type=None,
-        num_layers=None,
-        verbose=False,
-        idf=False,
-        device=None,
-        batch_size=64,
-        nthreads=4,
-        all_layers=False,
-        rescale_with_baseline=False,
-        baseline_path=None,
-        use_fast_tokenizer=False,
-    ):
-        get_hash = bert_score.utils.get_hash
-        scorer = bert_score.BERTScorer
-
-        if version.parse(bert_score.__version__) >= version.parse("0.3.10"):
-            get_hash = functools.partial(get_hash, use_fast_tokenizer=use_fast_tokenizer)
-            scorer = functools.partial(scorer, use_fast_tokenizer=use_fast_tokenizer)
-        elif use_fast_tokenizer:
-            raise ImportWarning(
-                "To use a fast tokenizer, the module `bert-score>=0.3.10` is required, and the current version of `bert-score` doesn't match this condition.\n"
-                'You can install it with `pip install "bert-score>=0.3.10"`.'
-            )
-
-        if model_type is None:
-            assert lang is not None, "either lang or model_type should be specified"
-            model_type = bert_score.utils.lang2model[lang.lower()]
-
-        if num_layers is None:
-            num_layers = bert_score.utils.model2layers[model_type]
-
-        hashcode = get_hash(
-            model=model_type,
-            num_layers=num_layers,
-            idf=idf,
-            rescale_with_baseline=rescale_with_baseline,
-            use_custom_baseline=baseline_path is not None,
-        )
-
-        with filter_logging_context():
-            if not hasattr(self, "cached_bertscorer") or self.cached_bertscorer.hash != hashcode:
-                self.cached_bertscorer = scorer(
-                    model_type=model_type,
-                    num_layers=num_layers,
-                    batch_size=batch_size,
-                    nthreads=nthreads,
-                    all_layers=all_layers,
-                    idf=idf,
-                    device=device,
-                    lang=lang,
-                    rescale_with_baseline=rescale_with_baseline,
-                    baseline_path=baseline_path,
-                )
-
-        (P, R, F) = self.cached_bertscorer.score(
-            cands=predictions,
-            refs=references,
-            verbose=verbose,
-            batch_size=batch_size,
-        )
-        output_dict = {
-            "precision": P.tolist(),
-            "recall": R.tolist(),
-            "f1": F.tolist(),
-            "hashcode": hashcode,
-        }
-        return output_dict
-
-    def add_batch(self, predictions=None, references=None, **kwargs):
-        """Add a batch of predictions and references for the metric's stack."""
-        # References can be strings or lists of strings
-        # Let's change strings to lists of strings with one element
-        if references is not None:
-            references = [[ref] if isinstance(ref, str) else ref for ref in references]
-        super().add_batch(predictions=predictions, references=references, **kwargs)
-
-    def add(self, prediction=None, reference=None, **kwargs):
-        """Add one prediction and reference for the metric's stack."""
-        # References can be strings or lists of strings
-        # Let's change strings to lists of strings with one element
-        if isinstance(reference, str):
-            reference = [reference]
-        super().add(prediction=prediction, reference=reference, **kwargs)
diff --git a/scripts/load_bert_score.py b/scripts/load_bert_score.py
deleted file mode 100644
index b186ddf..0000000
--- a/scripts/load_bert_score.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from datasets import load_metric
-
-bert_score_metric = load_metric("bertscore")
-print(bert_score_metric)
-
-# $HOME/.cache/huggingface/metrics/bert_score
diff --git a/scripts/load_bleurt.py b/scripts/load_bleurt.py
deleted file mode 100644
index 1162c48..0000000
--- a/scripts/load_bleurt.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from datasets import load_metric
-
-bleurt = load_metric("bleurt", "BLEURT-20")
-
-print(bleurt)
-# $HOME/.cache/huggingface/metrics/bleurt/BLEURT-20
diff --git a/scripts/load_comet.py b/scripts/load_comet.py
deleted file mode 100644
index 988e811..0000000
--- a/scripts/load_comet.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from artificial_detection.data.proxy import CometMetrics
-
-comet_model, model_path = CometMetrics.load_offline()
-
-print("model_path:", model_path)
-# $HOME/.cache/torch/unbabel_comet/wmt20-comet-da/checkpoints/model.ckpt
diff --git a/scripts/load_xlm_large.py b/scripts/load_xlm_large.py
deleted file mode 100644
index a180fd5..0000000
--- a/scripts/load_xlm_large.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from transformers import XLMRobertaModel, XLMRobertaTokenizer
-
-tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")
-model = XLMRobertaModel.from_pretrained(
-    "xlm-roberta-large", add_pooling_layer=False
-)
-# $HOME/atd-models/xlm-roberta-large
diff --git a/scripts/run_proxy.sh b/scripts/run_proxy.sh
deleted file mode 100644
index 45acd1c..0000000
--- a/scripts/run_proxy.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-
-python artificial_detection/data/proxy.py \
-    --df_path="$HOME/atd-data/metrics_checkpoint_merged_df.tsv" \
-    --metrics_names="$1" \
-    --model_path="$2" \
-    --baseline_path="$3" \
-    --output_path="$HOME/atd-data/collected_metrics_$1.tsv"

From 56656385ad2cf99bb9705d48e885efb201add213 Mon Sep 17 00:00:00 2001
From: Marat Saidov <marat.a.saidov@gmail.com>
Date: Mon, 2 Jan 2023 14:03:37 +0700
Subject: [PATCH 2/3] docs refactoring and todos for bad snippets

---
 README.md                                    | 51 +++++++-------------
 artificial_detection/pipelines/supervised.py |  1 +
 docs/data.md                                 | 33 +++++++++++++
 tests/test_translate.py                      |  6 ---
 4 files changed, 52 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index ae84d33..8a54d4d 100644
--- a/README.md
+++ b/README.md
@@ -23,58 +23,43 @@ NLP approaches to compare natural text against generated by neural networks.
 
 ### Contents
 
-Project description is put into:
+This project has extensive documentation, which is divided into the following parts:
 
-- [Framework Description Markdown](https://github.com/MaratSaidov/artificial-text-detection/blob/main/detection/README.md)
-- [Data Description Markdown](https://github.com/MaratSaidov/artificial-text-detection/blob/main/detection/data/README.md)
-- [Models Description Markdown](https://github.com/MaratSaidov/artificial-text-detection/blob/main/detection/data/README.md)
+- [ATD framework](docs/framework.md). This section describes the general approach to detecting artificial texts.
+  It gives an overview of the end-to-end pipelines available for this task.
+- [Data](docs/data.md). Textual domains were aggregated in a specific way.
+  This section describes the generative part of the task.
+- [Discriminator models](docs/models.md). This section describes the discriminative part of the task.
 
 ### Installation steps:
 
-We use [`poetry`](https://python-poetry.org/) as an enhanced dependency resolver.
-
+We use [`poetry`](https://python-poetry.org/) as a dependency resolver.
 ```bash
 make poetry-download
 poetry install --no-dev
 ```
 
-### Datasets for artificial text detection
-
-To create datasets for the further classification, it is necessary to collect them.
-There are 2 available ways for it:
-
-- Via [Data Version Control](https://dvc.org/).
-Get in touch with [`@msaidov`](https://t.me/msaidov) in order to have the access to the private Google Drive;
-- Via datasets generation. One dataset with a size of 20,000 samples was process with MT model on V100 GPU for 30 mins;
-
-### Data Version Control usage:
-
-```bash
-poetry add "dvc[gdrive]"
-```
-
-Then, run `dvc pull`. It will download preprocessed translation datasets
-from the Google Drive.
-
-### Datasets generation
-
-To generate translations before artificial text detection pipeline,
-install the `detection` module from the cloned repo or PyPi (TODO):
-```bash
-pip install -e .
-```
-Then, run generate script:
+Make sure that the `artificial_detection` library is pre-configured in your environment:
 ```bash
-python detection/data/generate.py --dataset_name='tatoeba' --size=20000 --device='cuda:0'
+pip show artificial-detection
 ```
 
 ### Simple run:
 
 To run the artificial text detection classifier, execute the pipeline:
 
+<!-- TODO: improve runners -->
 ```bash
 python detection/old.py
 ```
 
+
+### DVC for prototypes
+
+[DVC](https://dvc.org/) is used to pull small- and medium-sized samples from remote storage and to push updates to it.
+Google Drive has been chosen as the remote storage.
+Credentials are hidden. Reach out to the maintainer for contributions and prototyping support.
+
+
 [build_status_badge]: https://github.com/MaratSaidov/artificial-text-detection/actions/workflows/build.yml/badge.svg
 [build_status_link]: https://github.com/MaratSaidov/artificial-text-detection/actions/workflows/build.yml
diff --git a/artificial_detection/pipelines/supervised.py b/artificial_detection/pipelines/supervised.py
index 12944cf..e616aa6 100644
--- a/artificial_detection/pipelines/supervised.py
+++ b/artificial_detection/pipelines/supervised.py
@@ -44,6 +44,7 @@ def read_splits(df, as_datasets):
 
 
 def prepare_data(tokenizer):
+    # TODO: get rid of prefixes which include my root
     data_path = "/home/masaidov/atd-data/metrics_df.tsv"
     df = pd.read_csv(data_path, sep="\t")
     df = df[["text", "label", "subset"]]
diff --git a/docs/data.md b/docs/data.md
index d51771d..e8128a7 100644
--- a/docs/data.md
+++ b/docs/data.md
@@ -31,3 +31,36 @@ dvc push
 ```
 
 Do not forget to commit `data.dvc` file in order to pull processed datasets later.
+
+
+# TODO: refactor later on
+
+### Datasets for artificial text detection
+
+To create datasets for the further classification, it is necessary to collect them.
+There are 2 available ways for it:
+
+- Via [Data Version Control](https://dvc.org/).
+Get in touch with [`@msaidov`](https://t.me/msaidov) to get access to the private Google Drive;
+- Via datasets generation. One dataset with a size of 20,000 samples was processed with an MT model on a V100 GPU in 30 minutes;
+
+### Data Version Control usage:
+
+```bash
+poetry add "dvc[gdrive]"
+```
+
+Then, run `dvc pull`. It will download preprocessed translation datasets
+from the Google Drive.
+
+### Datasets generation
+
+To generate translations before running the artificial text detection pipeline,
+install the `detection` module from the cloned repo or PyPI (TODO):
+```bash
+pip install -e .
+```
+Then, run generate script:
+```bash
+python detection/data/generate.py --dataset_name='tatoeba' --size=20000 --device='cuda:0'
+```
diff --git a/tests/test_translate.py b/tests/test_translate.py
index de5c87f..dd2f7d0 100644
--- a/tests/test_translate.py
+++ b/tests/test_translate.py
@@ -6,7 +6,6 @@
 
 from artificial_detection.models.translation import TranslationModel
 from artificial_detection.utils import get_dataset_path, save_translations_texts
-from tests import skip_github
 
 
 class TestTranslate(TestCase):
@@ -55,8 +54,3 @@ def test_save_to_csv(self) -> None:
         assert_that(df_sample.columns.tolist(), has_items(*["sources", "targets", "translations"]))
         assert_that(len(df_sample), equal_to(5))
         os.remove(dataset_path)
-
-    @skip_github
-    def test_gpu_usage(self) -> None:
-        # TODO
-        pass

From a36e58e442bc9d2c8f80b568ea3154ee37fde87e Mon Sep 17 00:00:00 2001
From: Marat Saidov <marat.a.saidov@gmail.com>
Date: Mon, 2 Jan 2023 20:56:45 +0700
Subject: [PATCH 3/3] templated docs

---
 docs/data.md      | 22 ++++++++++++++++++++--
 docs/framework.md | 16 +++-------------
 docs/models.md    |  4 +++-
 3 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/docs/data.md b/docs/data.md
index e8128a7..2b72087 100644
--- a/docs/data.md
+++ b/docs/data.md
@@ -1,6 +1,8 @@
-# Data generation description
+# Data
+
+This part of the documentation covers the datasets and the data generation process.
+
 
-Some additional information about datasets.
 
 ### Data naming rules:
 
@@ -64,3 +66,19 @@ Then, run generate script:
 ```bash
 python detection/data/generate.py --dataset_name='tatoeba' --size=20000 --device='cuda:0'
 ```
+
+TODO: refactoring
+
+The purpose of the Detection framework is to generalize artificial text detection approaches.
+
+At the current stage this framework provides the support of:
+
+- Datasets from [tatoeba](https://huggingface.co/datasets/tatoeba),
+[WikiMatrix](https://github.com/facebookresearch/LASER/tree/main/tasks/WikiMatrix);
+- Models from [EasyNMT](https://github.com/UKPLab/EasyNMT).
+
+Here is a list of supported languages:
+
+| Source language  | Target Language | Dataset |
+| :---: | :---: | :---: |
+| Russian | English | Tatoeba |
diff --git a/docs/framework.md b/docs/framework.md
index 07f49de..5ffa560 100644
--- a/docs/framework.md
+++ b/docs/framework.md
@@ -1,15 +1,5 @@
-# Detection framework
+# Artificial Text Detection: Pipelines and General Framework
 
-The purpose of the Detection framework is to generalize artificial text detection approaches.
+Here we describe the pipeline structure and how we dealt with multiple domains.
 
-At the current stage this framework provides the support of:
-
-- Datasets from [tatoeba](https://huggingface.co/datasets/tatoeba),
-[WikiMatrix](https://github.com/facebookresearch/LASER/tree/main/tasks/WikiMatrix);
-- Models from [EasyNMT](https://github.com/UKPLab/EasyNMT).
-
-Here is a list of supported languages:
-
-| Source language  | Target Language | Dataset |
-| :---: | :---: | :---: |
-| Russian | English | Tatoeba |
+TODO
diff --git a/docs/models.md b/docs/models.md
index 19d9ca6..8d0a721 100644
--- a/docs/models.md
+++ b/docs/models.md
@@ -1,4 +1,6 @@
-# Models description
+# Discriminative modeling
+
+TODO: we train discriminators...
 
 Primarly, we use a wrapper on machine translation models from [`EasyNMT`](https://github.com/UKPLab/EasyNMT).