
Commit 8af2403

HYLcool and Cathy0908 authored

Support storing image bytes in the dataset (#725)
* fix wrong mm_key init
* support loading images from a specified bytes key instead of paths only
* fix the wrong context handling logic in the compute_stats_batched method of the basic Filter
* make all image-related filters support loading from bytes data
* make all image-related mappers support loading from bytes data
* make all image-related mappers support updating the bytes data to the latest version
* restore the context behavior for the base op
* support context for the general fused op
* fix minor bugs
* fix a schema alignment problem in the general fused op
* change the parent class of GeneralFusedOP from OP to Mapper
* update DownloadFileMapper and the custom webdataset encoder and decoder (#724):
  * support context for DownloadFileMapper and the custom webdataset encoder and decoder
  * update DownloadFileMapper
  * update RayDataset
  * update the webdataset _custom_default_encoder
  * add a ray tag for the webdataset test case
* move webdataset_utils to the utils module and add a customizable reconstruct function
* support customized webdataset format reconstruction before exporting
* support export_type
* update exporter args in the analyzer
* fix the logic for the default alignment in download_file_mapper
* specify the stdout encoding for the new process
* refactor the export method of RayExporter to the _router style
* fix docstring

Co-authored-by: Cathy0908 <30484308+Cathy0908@users.noreply.github.com>
1 parent 800a1d9 commit 8af2403

Note: large commits have some content hidden by default, so only part of the diff is shown below.

42 files changed: +1011 -137 lines

configs/config_all.yaml

Lines changed: 9 additions & 3 deletions
@@ -27,8 +27,13 @@ validators: # validators are a l
 text: 'str'

 export_path: '/path/to/result/dataset.jsonl'  # path to processed result dataset. Supported suffixes include ['jsonl', 'json', 'parquet']
+export_type: 'jsonl'  # The export format type. If it's not specified, Data-Juicer will parse it from the export_path. The supported types can be found in Exporter._router() for standalone mode and RayExporter._SUPPORTED_FORMATS for ray mode
 export_shard_size: 0  # shard size of the exported dataset in bytes. By default it's 0, which means the whole dataset is exported into a single file. If it's set to a positive number, the exported dataset will be split into several shards, and the max size of each shard won't be larger than export_shard_size
 export_in_parallel: false  # whether to export the result dataset in parallel to a single file, which usually takes less time. It only works when export_shard_size is 0, and its default number of processes is the same as the argument np. **Notice**: If it's True, sometimes exporting in parallel might require much more time due to IO blocking, especially for very large datasets. When this happens, False is a better choice, although it takes more time.
+keep_stats_in_res_ds: false  # whether to keep the computed stats in the result dataset. The intermediate fields that store the stats computed by Filters will be removed if it's False. It's False by default.
+keep_hashes_in_res_ds: false  # whether to keep the computed hashes in the result dataset. The intermediate fields that store the hashes computed by Deduplicators will be removed if it's False. It's False by default.
+export_extra_args: {}  # other optional arguments for exporting, as a dict. For example, the key mapping info for exporting in WebDataset format.
+
 np: 4  # number of subprocesses to process your dataset
 text_keys: 'text'  # the key name of the field where the sample texts to be processed are stored, e.g., `text`, `instruction`, `output`, ...
 # Note: currently, we support specifying only ONE key for each op. For cases requiring multiple keys, users can specify the op multiple times. We will only use the first key of `text_keys` when you set multiple keys.

@@ -46,12 +51,11 @@ trace_num: 10 # number of samples
 op_fusion: false  # whether to fuse operators that share the same intermediate variables automatically. Op fusion might reduce the memory requirements slightly but speed up the whole process.
 fusion_strategy: 'probe'  # OP fusion strategy. Supports ['greedy', 'probe'] now. 'greedy' means keep the basic OP order and put the fused OP at the end of each fused OP group. 'probe' means Data-Juicer will probe the running speed of each OP at the beginning and reorder the OPs and fused OPs according to their probed speed (fast to slow). It's 'probe' by default.
 cache_compress: null  # the compression method of the cache file, which can be specified in ['gzip', 'zstd', 'lz4']. If this parameter is None, the cache file will not be compressed. We recommend turning this argument on when your input dataset is larger than tens of GB and your disk space is limited.
-keep_stats_in_res_ds: false  # whether to keep the computed stats in the result dataset. The intermediate fields that store the stats computed by Filters will be removed if it's False. It's False by default.
-keep_hashes_in_res_ds: false  # whether to keep the computed hashes in the result dataset. The intermediate fields that store the hashes computed by Deduplicators will be removed if it's False. It's False by default.
 adaptive_batch_size: false  # whether to use adaptive batch sizes for each OP according to the probed results. It's False by default.

 # for multimodal data processing
 image_key: 'images'  # key name of the field that stores the list of sample image paths.
+image_bytes_key: 'image_bytes'  # key name of the field that stores the list of sample image bytes.
 image_special_token: '<__dj__image>'  # the special token that represents an image in the text. By default, it's "<__dj__image>". You can specify your own special token according to your input dataset.
 audio_key: 'audios'  # key name of the field that stores the list of sample audio paths.
 audio_special_token: '<__dj__audio>'  # the special token that represents an audio in the text. By default, it's "<__dj__audio>". You can specify your own special token according to your input dataset.

@@ -273,10 +277,12 @@ process:
   - extract_tables_from_html_mapper:  # extract tables from HTML content
       tables_field_name: 'html_tables'  # Field name to store the extracted tables.
       retain_html_tags: false,  # If True, retains HTML tags in the tables; otherwise, removes them.
-      include_header: true,  # If True, includes the table header; otherwise, excludes it. This parameter is effective only when `retain_html_tags` is False and applies solely to the extracted table content.
+      include_header: true  # If True, includes the table header; otherwise, excludes it. This parameter is effective only when `retain_html_tags` is False and applies solely to the extracted table content.
   - download_file_mapper:  # download url files to local files
       save_dir: null  # The directory to save downloaded files.
       download_field: null  # The field name to get the url to download.
+      save_field: null  # The field name to save the downloaded file content.
+      resume_download: false  # Whether to resume download. If True, skip the sample if it already exists.
       timeout: 30  # The timeout in seconds for each HTTP request.
       max_concurrent: 10  # Maximum concurrent downloads.
   - fix_unicode_mapper:  # fix unicode errors in text.
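To make the new key concrete: `image_key` and `image_bytes_key` are parallel fields, one holding paths and the other raw encoded bytes. A minimal sketch of such a sample in Python (the path and text are hypothetical; only the key names come from the config above):

# Hypothetical multimodal sample; `images` (image_key) holds paths while
# `image_bytes` (image_bytes_key) holds the raw encoded bytes, aligned by index.
sample = {
    "text": "<__dj__image> a photo of a cat",
    "images": ["imgs/cat.jpg"],
    "image_bytes": [open("imgs/cat.jpg", "rb").read()],
}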

data_juicer/config/config.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,14 @@ def init_configs(args: Optional[List[str]] = None, which_entry: object = None, l
167167
"directory to store the processed dataset will be the work "
168168
"directory of this process.",
169169
)
170+
parser.add_argument(
171+
"--export_type",
172+
type=str,
173+
default=None,
174+
help="The export format type. If it's not specified, Data-Juicer will parse from the export_path. The "
175+
"supported types can be found in Exporter._router() for standalone mode and "
176+
"RayExporter._SUPPORTED_FORMATS for ray mode",
177+
)
170178
parser.add_argument(
171179
"--export_shard_size",
172180
type=NonNegativeInt,
@@ -190,6 +198,13 @@ def init_configs(args: Optional[List[str]] = None, which_entry: object = None, l
190198
"When this happens, False is a better choice, although it takes "
191199
"more time.",
192200
)
201+
parser.add_argument(
202+
"--export_extra_args",
203+
type=Dict,
204+
default={},
205+
help="Other optional arguments for exporting in dict. For example, the key mapping info for exporting "
206+
"the WebDataset format.",
207+
)
193208
parser.add_argument(
194209
"--keep_stats_in_res_ds",
195210
type=bool,
@@ -224,6 +239,12 @@ def init_configs(args: Optional[List[str]] = None, which_entry: object = None, l
224239
default="images",
225240
help="Key name of field to store the list of sample image paths.", # noqa: E251
226241
)
242+
parser.add_argument(
243+
"--image_bytes_key",
244+
type=str,
245+
default="image_bytes",
246+
help="Key name of field to store the list of sample image bytes.", # noqa: E251
247+
)
227248
parser.add_argument(
228249
"--image_special_token",
229250
type=str,
@@ -667,6 +688,7 @@ def init_setup_from_cfg(cfg: Namespace, load_configs_only=False):
667688
"image_key": cfg.get("image_key", "images"),
668689
"audio_key": cfg.get("audio_key", "audios"),
669690
"video_key": cfg.get("video_key", "videos"),
691+
"image_bytes_key": cfg.get("image_bytes_key", "image_bytes"),
670692
"num_proc": cfg.np,
671693
"turbo": cfg.get("turbo", False),
672694
"skip_op_error": cfg.get("skip_op_error", True),

data_juicer/core/analyzer.py

Lines changed: 1 addition & 0 deletions
@@ -58,6 +58,7 @@ def __init__(self, cfg: Optional[Namespace] = None):
         logger.info("Preparing exporter...")
         self.exporter = Exporter(
             self.cfg.export_path,
+            self.cfg.export_type,
             self.cfg.export_shard_size,
             self.cfg.export_in_parallel,
             self.cfg.np,

data_juicer/core/data/ray_dataset.py

Lines changed: 10 additions & 5 deletions
@@ -16,8 +16,8 @@
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.file_utils import is_remote_path
 from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import SpecialTokens
 from data_juicer.utils.process_utils import calculate_np
+from data_juicer.utils.webdataset_utils import _custom_default_decoder

 ray = LazyLoader("ray")

@@ -53,9 +53,9 @@ def set_dataset_to_absolute_path(dataset, dataset_path, cfg):
     path_keys = []
     columns = dataset.columns()
     for key in [
-        cfg.get("video_key", SpecialTokens.video),
-        cfg.get("image_key", SpecialTokens.image),
-        cfg.get("audio_key", SpecialTokens.audio),
+        cfg.get("video_key", "videos"),
+        cfg.get("image_key", "images"),
+        cfg.get("audio_key", "audios"),
     ]:
         if key in columns:
             path_keys.append(key)

@@ -239,6 +239,8 @@ def process_batch_arrow(table: pyarrow.Table):
     def read(cls, data_format: str, paths: Union[str, List[str]]) -> RayDataset:
         if data_format in {"json", "jsonl"}:
             return RayDataset.read_json(paths)
+        elif data_format == "webdataset":
+            return RayDataset.read_webdataset(paths)
         elif data_format in {
             "parquet",
             "images",

@@ -248,7 +250,6 @@ def read(cls, data_format: str, paths: Union[str, List[str]]) -> RayDataset:
             "avro",
             "numpy",
             "tfrecords",
-            "webdataset",
             "binary_files",
             "lance",
         }:

@@ -266,6 +267,10 @@ def read_json(cls, paths: Union[str, List[str]]) -> RayDataset:
         except AttributeError:
             return ray.data.read_json(paths)

+    @classmethod
+    def read_webdataset(cls, paths: Union[str, List[str]]) -> RayDataset:
+        return ray.data.read_webdataset(paths, decoder=partial(_custom_default_decoder, format="PIL"))
+
     def to_list(self) -> list:
         return self.data.to_pandas().to_dict(orient="records")
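With the dedicated branch above, WebDataset shards no longer go through Ray's default decoder. A minimal usage sketch (the shard path is hypothetical):

from data_juicer.core.data.ray_dataset import RayDataset  # import path from this diff

# Dispatches to RayDataset.read_webdataset, which decodes samples with
# _custom_default_decoder (images are returned as PIL objects).
ds = RayDataset.read("webdataset", ["/path/to/shards/part-000000.tar"])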

data_juicer/core/executor/default_executor.py

Lines changed: 2 additions & 0 deletions
@@ -72,11 +72,13 @@ def __init__(self, cfg: Optional[Namespace] = None):
         logger.info("Preparing exporter...")
         self.exporter = Exporter(
             self.cfg.export_path,
+            self.cfg.export_type,
             self.cfg.export_shard_size,
             self.cfg.export_in_parallel,
             self.cfg.np,
             keep_stats_in_res_ds=self.cfg.keep_stats_in_res_ds,
             keep_hashes_in_res_ds=self.cfg.keep_hashes_in_res_ds,
+            **self.cfg.export_extra_args,
         )

         # setup tracer

data_juicer/core/executor/ray_executor.py

Lines changed: 2 additions & 0 deletions
@@ -68,8 +68,10 @@ def __init__(self, cfg: Optional[Namespace] = None):
         logger.info("Preparing exporter...")
         self.exporter = RayExporter(
             self.cfg.export_path,
+            self.cfg.export_type,
             keep_stats_in_res_ds=self.cfg.keep_stats_in_res_ds,
             keep_hashes_in_res_ds=self.cfg.keep_hashes_in_res_ds,
+            **self.cfg.export_extra_args,
         )

     def run(self, load_data_np: Optional[PositiveInt] = None, skip_return=False):

data_juicer/core/exporter.py

Lines changed: 9 additions & 9 deletions
@@ -18,13 +18,15 @@ class Exporter:
     def __init__(
         self,
         export_path,
+        export_type=None,
         export_shard_size=0,
         export_in_parallel=True,
         num_proc=1,
         export_ds=True,
         keep_stats_in_res_ds=False,
         keep_hashes_in_res_ds=False,
         export_stats=True,
+        **kwargs,
     ):
         """
         Initialization method.

@@ -48,7 +50,13 @@ def __init__(
         self.keep_stats_in_res_ds = keep_stats_in_res_ds
         self.keep_hashes_in_res_ds = keep_hashes_in_res_ds
         self.export_stats = export_stats
-        self.suffix = self._get_suffix(export_path)
+        self.suffix = self._get_suffix(export_path) if export_type is None else export_type
+        support_dict = self._router()
+        if self.suffix not in support_dict:
+            raise NotImplementedError(
+                f"Suffix of export path [{export_path}] or specified export_type [{export_type}] is not supported "
+                f"for now. Only support {list(support_dict.keys())}."
+            )
         self.num_proc = num_proc
         self.max_shard_size_str = ""

@@ -90,14 +98,6 @@ def _get_suffix(self, export_path):
         :return: the suffix of export_path.
         """
         suffix = export_path.split(".")[-1].lower()
-        support_dict = self._router()
-        if suffix not in support_dict:
-            raise NotImplementedError(
-                f"Suffix of export path ["
-                f"{export_path}] is not supported "
-                f"for now. Only support "
-                f"{list(support_dict.keys())}."
-            )
         return suffix

     def _export_impl(self, dataset, export_path, suffix, export_stats=True):
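The net effect: the format check moves from _get_suffix into __init__, and an explicit export_type takes precedence over the path suffix. A minimal sketch of the resulting behavior (paths are illustrative):

from data_juicer.core.exporter import Exporter  # import path from this diff

# Unchanged default: the format is parsed from the path suffix.
exporter = Exporter("/path/to/result/dataset.jsonl")

# New: export_type overrides the suffix, so non-standard extensions work.
exporter = Exporter("/path/to/result/dataset.data", export_type="jsonl")

# Unsupported formats now fail fast at construction time.
try:
    Exporter("/path/to/result/dataset.data", export_type="xml")
except NotImplementedError as e:
    print(e)  # lists the supported keys from Exporter._router()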

data_juicer/core/ray_exporter.py

Lines changed: 80 additions & 11 deletions
@@ -1,8 +1,10 @@
 import os
+from functools import partial

 from loguru import logger

 from data_juicer.utils.constant import Fields, HashKeys
+from data_juicer.utils.webdataset_utils import reconstruct_custom_webdataset_format


 class RayExporter:

@@ -22,7 +24,7 @@ class RayExporter:
         # 'numpy',
     }

-    def __init__(self, export_path, keep_stats_in_res_ds=True, keep_hashes_in_res_ds=False):
+    def __init__(self, export_path, export_type=None, keep_stats_in_res_ds=True, keep_hashes_in_res_ds=False, **kwargs):
         """
         Initialization method.

@@ -35,7 +37,13 @@ def __init__(self, export_path, keep_stats_in_res_ds=True, keep_hashes_in_res_ds
         self.export_path = export_path
         self.keep_stats_in_res_ds = keep_stats_in_res_ds
         self.keep_hashes_in_res_ds = keep_hashes_in_res_ds
-        self.export_format = self._get_export_format(export_path)
+        self.export_format = self._get_export_format(export_path) if export_type is None else export_type
+        if self.export_format not in self._SUPPORTED_FORMATS:
+            raise NotImplementedError(
+                f'export data format "{self.export_format}" is not supported '
+                f"for now. Only support {self._SUPPORTED_FORMATS}. Please check export_type or export_path."
+            )
+        self.export_extra_args = kwargs if kwargs is not None else {}

     def _get_export_format(self, export_path):
         """

@@ -54,11 +62,6 @@ def _get_export_format(self, export_path):
             suffix = "jsonl"

         export_format = suffix
-        if export_format not in self._SUPPORTED_FORMATS:
-            raise NotImplementedError(
-                f'export data format "{export_format}" is not supported '
-                f"for now. Only support {self._SUPPORTED_FORMATS}."
-            )
         return export_format

     def _export_impl(self, dataset, export_path, columns=None):

@@ -88,10 +91,12 @@ def _export_impl(self, dataset, export_path, columns=None):
         if len(removed_fields):
             dataset = dataset.drop_columns(removed_fields)

-        if self.export_format in {"json", "jsonl"}:
-            return dataset.write_json(export_path, force_ascii=False)
-        else:
-            return getattr(dataset, f"write_{self.export_format}")(export_path)
+        export_method = RayExporter._router()[self.export_format]
+        export_kwargs = {
+            "export_extra_args": self.export_extra_args,
+            "export_format": self.export_format,
+        }
+        return export_method(dataset, export_path, **export_kwargs)

     def export(self, dataset, columns=None):
         """

@@ -102,3 +107,67 @@
         :return:
         """
         self._export_impl(dataset, self.export_path, columns)
+
+    @staticmethod
+    def write_json(dataset, export_path, **kwargs):
+        """
+        Export method for json/jsonl target files.
+
+        :param dataset: the dataset to export.
+        :param export_path: the path to store the exported dataset.
+        :param kwargs: extra arguments.
+        :return:
+        """
+        return dataset.write_json(export_path, force_ascii=False)
+
+    @staticmethod
+    def write_webdataset(dataset, export_path, **kwargs):
+        """
+        Export method for webdataset target files.
+
+        :param dataset: the dataset to export.
+        :param export_path: the path to store the exported dataset.
+        :param kwargs: extra arguments.
+        :return:
+        """
+        from data_juicer.utils.webdataset_utils import _custom_default_encoder
+
+        # check if we need to reconstruct the customized WebDataset format
+        export_extra_args = kwargs.get("export_extra_args", {})
+        field_mapping = export_extra_args.get("field_mapping", {})
+        if len(field_mapping) > 0:
+            reconstruct_func = partial(reconstruct_custom_webdataset_format, field_mapping=field_mapping)
+            dataset = dataset.map(reconstruct_func)
+
+        return dataset.write_webdataset(export_path, encoder=_custom_default_encoder)
+
+    @staticmethod
+    def write_others(dataset, export_path, **kwargs):
+        """
+        Export method for other target files.
+
+        :param dataset: the dataset to export.
+        :param export_path: the path to store the exported dataset.
+        :param kwargs: extra arguments.
+        :return:
+        """
+        export_format = kwargs.get("export_format", "parquet")
+        return getattr(dataset, f"write_{export_format}")(export_path)
+
+    # suffix to export method
+    @staticmethod
+    def _router():
+        """
+        A router from different suffixes to corresponding export methods.
+
+        :return: A dict router.
+        """
+        return {
+            "jsonl": RayExporter.write_json,
+            "json": RayExporter.write_json,
+            "webdataset": RayExporter.write_webdataset,
+            "parquet": RayExporter.write_others,
+            "csv": RayExporter.write_others,
+            "tfrecords": RayExporter.write_others,
+            "lance": RayExporter.write_others,
+        }
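Putting the router and the extra args together: any keyword argument beyond export_type is collected into export_extra_args and forwarded to the chosen write method. A sketch of a WebDataset export with a custom field layout (the field_mapping content and its direction are illustrative, not confirmed by this diff):

from data_juicer.core.ray_exporter import RayExporter  # import path from this diff

exporter = RayExporter(
    "/path/to/result/dataset",
    export_type="webdataset",
    # Collected into export_extra_args; a non-empty field_mapping triggers
    # reconstruct_custom_webdataset_format before dataset.write_webdataset.
    field_mapping={"txt": "text", "jpg": "image_bytes"},
)
exporter.export(ray_dataset)  # `ray_dataset`: a Ray Dataset produced by the pipeline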
