
Commit 644fd5d

Enable xet in push to hub (#7552)
* enable xet in push_to_hub
* use binary in hfh upload instead of buffer
1 parent e939959
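The core change: instead of handing huggingface_hub a BytesIO stream, push_to_hub now extracts the raw bytes and passes those to CommitOperationAdd, which is what lets the xet-backed upload path receive binary content. A minimal sketch of the before/after pattern (the repo path and payload are placeholders; CommitOperationAdd accepts both binary file objects and raw bytes):

    from io import BytesIO

    from huggingface_hub import CommitOperationAdd

    payload = b"...parquet bytes..."  # placeholder for a serialized shard

    # Before: the stream object itself was passed as path_or_fileobj.
    buffer = BytesIO(payload)
    op_old = CommitOperationAdd(
        path_in_repo="data/train-00000-of-00001.parquet",  # placeholder path
        path_or_fileobj=buffer,
    )

    # After: the raw bytes are extracted and passed directly.
    op_new = CommitOperationAdd(
        path_in_repo="data/train-00000-of-00001.parquet",
        path_or_fileobj=buffer.getvalue(),
    )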

File tree

2 files changed: +9 -9 lines changed


src/datasets/arrow_dataset.py

Lines changed: 8 additions & 5 deletions
@@ -5394,8 +5394,10 @@ def shards_with_embedded_external_files(shards: Iterator[Dataset]) -> Iterator[D
             shard_path_in_repo = f"{data_dir}/{split}-{index:05d}-of-{num_shards:05d}.parquet"
             buffer = BytesIO()
             shard.to_parquet(buffer)
-            uploaded_size += buffer.tell()
-            shard_addition = CommitOperationAdd(path_in_repo=shard_path_in_repo, path_or_fileobj=buffer)
+            parquet_content = buffer.getvalue()
+            uploaded_size += len(parquet_content)
+            del buffer
+            shard_addition = CommitOperationAdd(path_in_repo=shard_path_in_repo, path_or_fileobj=parquet_content)
             api.preupload_lfs_files(
                 repo_id=repo_id,
                 additions=[shard_addition],
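A note on the size accounting above: after shard.to_parquet(buffer) the stream cursor sits at the end of what was written, so buffer.tell() and len(buffer.getvalue()) report the same number; the rewrite just takes the bytes copy first so del buffer can release the stream early. A small self-contained check (the write below stands in for shard.to_parquet):

    from io import BytesIO

    buffer = BytesIO()
    buffer.write(b"stand-in for shard.to_parquet(buffer)")

    # The cursor is at the end of the stream after writing, so tell()
    # matches the length of the accumulated bytes.
    parquet_content = buffer.getvalue()
    assert buffer.tell() == len(parquet_content)

    del buffer  # the bytes copy survives; the stream can be freed early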
@@ -5705,10 +5707,11 @@ def push_to_hub(
             with open(dataset_infos_path, encoding="utf-8") as f:
                 dataset_infos: dict = json.load(f)
             dataset_infos[config_name] = asdict(info_to_dump)
-            buffer = BytesIO()
-            buffer.write(json.dumps(dataset_infos, indent=4).encode("utf-8"))
             additions.append(
-                CommitOperationAdd(path_in_repo=config.DATASETDICT_INFOS_FILENAME, path_or_fileobj=buffer)
+                CommitOperationAdd(
+                    path_in_repo=config.DATASETDICT_INFOS_FILENAME,
+                    path_or_fileobj=json.dumps(dataset_infos, indent=4).encode("utf-8"),
+                )
             )
         # push to README
         DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data)
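This second hunk applies the same idea to the dataset_infos JSON: json.dumps(...).encode("utf-8") already yields bytes, so the intermediate BytesIO is dropped entirely. A standalone sketch, with a made-up infos payload in place of the real dataset_infos dict:

    import json

    from huggingface_hub import CommitOperationAdd

    # Hypothetical payload; in the diff this is the loaded dataset_infos dict.
    dataset_infos = {"default": {"dataset_size": 123}}

    op = CommitOperationAdd(
        path_in_repo="dataset_infos.json",  # the diff uses config.DATASETDICT_INFOS_FILENAME
        path_or_fileobj=json.dumps(dataset_infos, indent=4).encode("utf-8"),
    )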

src/datasets/dataset_dict.py

Lines changed: 1 addition & 4 deletions
@@ -7,7 +7,6 @@
 import re
 from collections.abc import Sequence
 from functools import partial
-from io import BytesIO
 from pathlib import Path
 from typing import Callable, Optional, Union

@@ -1853,12 +1852,10 @@ def push_to_hub(
             with open(dataset_infos_path, encoding="utf-8") as f:
                 dataset_infos: dict = json.load(f)
             dataset_infos[config_name] = asdict(info_to_dump)
-            buffer = BytesIO()
-            buffer.write(json.dumps(dataset_infos, indent=4).encode("utf-8"))
             additions.append(
                 CommitOperationAdd(
                     path_in_repo=config.DATASETDICT_INFOS_FILENAME,
-                    path_or_fileobj=buffer,
+                    path_or_fileobj=json.dumps(dataset_infos, indent=4).encode("utf-8"),
                 )
             )
         # push to README
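dataset_dict.py mirrors the arrow_dataset.py change (and drops the now-unused BytesIO import). Nothing changes on the caller's side; a hedged usage sketch with placeholder names, assuming you are authenticated to the Hub:

    from datasets import load_dataset

    # Placeholder repo and data file; push_to_hub now uploads shard bytes
    # and JSON metadata as binary content under the hood.
    ds = load_dataset("json", data_files="data.jsonl")
    ds.push_to_hub("username/my-dataset")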
