Commit f1c697d

Merge pull request #608 from GuoxiaWang/replace_url
Simple implementation of file download and caching tools
2 parents 46fe570 + 4fb839b

4 files changed: +134 −9 lines
fleetx/data/data_tools/gpt/README.md (+3 −3)
````diff
@@ -40,7 +40,7 @@
 First, download the sample data:
 ```
 mkdir data && cd data
-wget https://bj.bcebos.com/paddlefleetx/models/transformers/data_tools/baike.txt
+wget https://fleet.bj.bcebos.com/datasets/gpt/wikitext-103-en.txt
 cd ..
 ```
 ### Converting the raw data to jsonl format
@@ -96,7 +96,7 @@ optional arguments:
   -h, --help            show this help message and exit
   --model_name MODEL_NAME
                         What model to use.
-                        Required; e.g. ernie-1.0-base-zh. See the available model names at https://paddlenlp.readthedocs.io/zh/latest/model_zoo/index.html#transformer
+                        Required; e.g. gpt2
   --tokenizer_name {ErnieTokenizer,BertTokenizer,GPTTokenizer,GPTChineseTokenizer}
                         What type of tokenizer to use.
                         The tokenizer matching the model; currently only Ernie, Bert, and GPT are supported.
@@ -142,7 +142,7 @@ common config:
                         Number of processes used to tokenize the text into ids.
 ```
 Running the script below produces the processed pretraining data: token ids in `wikitext_103_en.npy` and article index information in `wikitext_103_en.npz`.
-When using `GPTTokenizer`, the files `gpt2-vocab.json` and `gpt2-merges.txt` are required; if they have not been downloaded and cached before, the script downloads and caches them automatically. If you run into network problems, you can download the two files yourself and place them under `~/.cache/cached_path/`.
+When using `GPTTokenizer`, the files `gpt2-vocab.json` and `gpt2-merges.txt` are required; if they have not been downloaded and cached before, the script downloads and caches them automatically. If you run into network problems, you can download the two files yourself and place them under `~/.cache/fleetx/`.
 ```
 python -u preprocess_data.py \
 --model_name gpt2 \
````

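If the automatic download fails, the README's manual fallback can be scripted. Below is a minimal sketch, assuming the vocab/merges URLs from the archive maps in `gpt_tokenizer.py` (shown in the next file) and the new default cache directory `~/.cache/fleetx/`:

```python
# Minimal sketch: pre-seed the fleetx cache so GPTTokenizer finds the files
# locally and skips the download. URLs are taken from the archive maps in
# gpt_tokenizer.py below; the cache directory is the new default.
import os
import requests

CACHE_DIR = os.path.expanduser("~/.cache/fleetx/")
URLS = [
    "https://fleet.bj.bcebos.com/datasets/gpt/gpt2-vocab.json",
    "https://fleet.bj.bcebos.com/datasets/gpt/gpt2-merges.txt",
]

os.makedirs(CACHE_DIR, exist_ok=True)
for url in URLS:
    target = os.path.join(CACHE_DIR, os.path.basename(url))
    if not os.path.exists(target):
        resp = requests.get(url, timeout=60)
        resp.raise_for_status()
        with open(target, "wb") as f:
            f.write(resp.content)
```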
fleetx/data/tokenizers/gpt_tokenizer.py (+4 −5)
```diff
@@ -25,6 +25,8 @@
 import regex as re
 from io import open

+from fleetx.utils.download import cached_path
+
 try:
     from functools import lru_cache
 except ImportError:
@@ -38,12 +40,10 @@ def lru_cache():
 logger = logging.getLogger(__name__)

 PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    'gpt2':
-    "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
+    'gpt2': "https://fleet.bj.bcebos.com/datasets/gpt/gpt2-vocab.json",
 }
 PRETRAINED_MERGES_ARCHIVE_MAP = {
-    'gpt2':
-    "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
+    'gpt2': "https://fleet.bj.bcebos.com/datasets/gpt/gpt2-merges.txt",
 }
 PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {'gpt2': 1024, }
 VOCAB_NAME = 'vocab.json'
@@ -124,7 +124,6 @@ def from_pretrained(cls,
                 special_tokens_file))
         # redirect to the cache, if necessary
         try:
-            from cached_path import cached_path
             resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
             resolved_merges_file = cached_path(
                 merges_file, cache_dir=cache_dir)
```

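The substance of this change: `cached_path` now comes from the in-repo `fleetx.utils.download` module (added below) rather than the third-party `cached_path` package, so the vocab and merges files resolve against the Baidu BOS mirror. A minimal usage sketch, assuming the package layout from this commit:

```python
# Sketch: resolving the GPT-2 vocab/merges files through the new helper.
# The first call downloads into ~/.cache/fleetx/ and returns the local
# path; later calls find the cached copy and never touch the network.
from fleetx.utils.download import cached_path

vocab_file = cached_path(
    "https://fleet.bj.bcebos.com/datasets/gpt/gpt2-vocab.json")
merges_file = cached_path(
    "https://fleet.bj.bcebos.com/datasets/gpt/gpt2-merges.txt")
print(vocab_file, merges_file)
```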
fleetx/utils/download.py (+126, new file)
```python
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import time
import requests
import shutil
from fleetx.utils import logger
from tqdm import tqdm
import paddle

DOWNLOAD_RETRY_LIMIT = 3


def is_url(path):
    """
    Whether path is URL.
    Args:
        path (string): URL string or not.
    """
    return path.startswith('http://') or path.startswith('https://')


def _map_path(url, root_dir):
    # Map a URL to its download destination under root_dir.
    fname = os.path.split(url)[-1]
    fpath = fname
    return os.path.join(root_dir, fpath)


def cached_path(url_or_path, cache_dir=None):
    if cache_dir is None:
        cache_dir = '~/.cache/fleetx/'

    cache_dir = os.path.expanduser(cache_dir)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    if is_url(url_or_path):
        path = _map_path(url_or_path, cache_dir)
        url = url_or_path
    else:
        path = url_or_path
        url = None

    if os.path.exists(path):
        logger.info(
            f"Found {os.path.split(path)[-1]} in cache_dir: {cache_dir}.")
        return path

    download(url, path)
    return path


def _download(url, fullname):
    """
    Download from url, save to fullname.
    url (str): download url
    fullname (str): local path to save the downloaded file to
    """
    retry_cnt = 0

    while not os.path.exists(fullname):
        if retry_cnt < DOWNLOAD_RETRY_LIMIT:
            retry_cnt += 1
        else:
            raise RuntimeError("Download from {} failed. "
                               "Retry limit reached".format(url))

        logger.info("Downloading {}".format(url))

        try:
            req = requests.get(url, stream=True)
        except Exception as e:  # requests.exceptions.ConnectionError
            logger.info("Downloading {} failed {} times with exception {}".
                        format(url, retry_cnt, str(e)))
            time.sleep(1)
            continue

        if req.status_code != 200:
            raise RuntimeError("Downloading from {} failed with code "
                               "{}!".format(url, req.status_code))

        # To protect against interrupted downloads, write to tmp_fullname
        # first, then move tmp_fullname to fullname once the download
        # has finished.
        tmp_fullname = fullname + "_tmp"
        total_size = req.headers.get('content-length')
        with open(tmp_fullname, 'wb') as f:
            if total_size:
                with tqdm(total=(int(total_size) + 1023) // 1024) as pbar:
                    for chunk in req.iter_content(chunk_size=1024):
                        f.write(chunk)
                        pbar.update(1)
            else:
                for chunk in req.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        shutil.move(tmp_fullname, fullname)

    return fullname


def download(url, path):
    # In distributed runs, only rank 0 downloads; the other ranks wait
    # until the file appears on disk.
    local_rank = 0
    world_size = 1
    if paddle.fluid.core.is_compiled_with_dist():
        local_rank = paddle.distributed.ParallelEnv().dev_id
        world_size = paddle.distributed.get_world_size()
    if world_size > 1 and local_rank != 0:
        while not os.path.exists(path):
            time.sleep(1)
    else:
        _download(url, path)
```

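To make the caching contract concrete, here is a small sketch of how the helpers compose (using the real merges URL from this commit; the local-path branch needs no network access). The `download` wrapper exists so multi-GPU runs fetch each file once: rank 0 downloads while the other ranks poll the filesystem until the file appears.

```python
# Sketch of the cached_path contract under this file's defaults.
from fleetx.utils.download import cached_path, is_url, _map_path

# A URL maps to <cache_dir>/<basename>; it is downloaded only on a cache miss.
url = "https://fleet.bj.bcebos.com/datasets/gpt/gpt2-merges.txt"
assert is_url(url)
assert _map_path(url, "/tmp/fleetx") == "/tmp/fleetx/gpt2-merges.txt"

# An existing local path is returned as-is, without touching the network.
assert cached_path(__file__) == __file__

# Caveat: a local path that does not exist falls through to
# download(None, path), so callers should pass either a URL or a
# path that already exists.
```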
requirements.txt (+1 −1)
```diff
@@ -1,6 +1,6 @@
 regex
 colorlog
 colorama
-cached_path >= 1.1.5
 inspect
 omegaconf
+tqdm
```
