From b4c0a0373d0fb68f50dc32f68661efe84032f91f Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 2 Jul 2025 14:58:54 +0200 Subject: [PATCH 1/5] align hffs and hfapi for expand --- src/huggingface_hub/hf_file_system.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index 4f438edc05..109dcd4ea1 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -367,7 +367,6 @@ def ls( """ resolved_path = self.resolve_path(path, revision=revision) path = resolved_path.unresolve() - kwargs = {"expand_info": detail, **kwargs} try: out = self._ls_tree(path, refresh=refresh, revision=revision, **kwargs) except EntryNotFoundError: @@ -386,7 +385,7 @@ def _ls_tree( recursive: bool = False, refresh: bool = False, revision: Optional[str] = None, - expand_info: bool = True, + expand_info: bool = False, ): resolved_path = self.resolve_path(path, revision=revision) path = resolved_path.unresolve() @@ -497,8 +496,6 @@ def walk(self, path: str, *args, **kwargs) -> Iterator[Tuple[str, List[str], Lis Returns: `Iterator[Tuple[str, List[str], List[str]]]`: An iterator of (path, list of directory names, list of file names) tuples. """ - # Set expand_info=False by default to get a x10 speed boost - kwargs = {"expand_info": kwargs.get("detail", False), **kwargs} path = self.resolve_path(path, revision=kwargs.get("revision")).unresolve() yield from super().walk(path, *args, **kwargs) @@ -515,8 +512,6 @@ def glob(self, path: str, **kwargs) -> List[str]: Returns: `List[str]`: List of paths matching the pattern. """ - # Set expand_info=False by default to get a x10 speed boost - kwargs = {"expand_info": kwargs.get("detail", False), **kwargs} path = self.resolve_path(path, revision=kwargs.get("revision")).unresolve() return super().glob(path, **kwargs) @@ -558,7 +553,6 @@ def find( ) resolved_path = self.resolve_path(path, revision=revision) path = resolved_path.unresolve() - kwargs = {"expand_info": detail, **kwargs} try: out = self._ls_tree(path, recursive=True, refresh=refresh, revision=resolved_path.revision, **kwargs) except EntryNotFoundError: @@ -683,7 +677,7 @@ def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, resolved_path = self.resolve_path(path, revision=revision) path = resolved_path.unresolve() expand_info = kwargs.get( - "expand_info", True + "expand_info", False ) # don't expose it as a parameter in the public API to follow the spec if not resolved_path.path_in_repo: # Path is the root directory @@ -708,7 +702,7 @@ def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, parent_path = self._parent(path) if not expand_info and parent_path not in self.dircache: # Fill the cache with cheap call - self.ls(parent_path, expand_info=False) + self.ls(parent_path) if parent_path in self.dircache: # Check if the path is in the cache out1 = [o for o in self.dircache[parent_path] if o["name"] == path] @@ -779,7 +773,7 @@ def exists(self, path, **kwargs): if kwargs.get("refresh", False): self.invalidate_cache(path) - self.info(path, **{**kwargs, "expand_info": False}) + self.info(path, **kwargs) return True except: # noqa: E722 return False @@ -798,7 +792,7 @@ def isdir(self, path): `bool`: True if path is a directory, False otherwise. """ try: - return self.info(path, expand_info=False)["type"] == "directory" + return self.info(path)["type"] == "directory" except OSError: return False @@ -816,7 +810,7 @@ def isfile(self, path): `bool`: True if path is a file, False otherwise. """ try: - return self.info(path, expand_info=False)["type"] == "file" + return self.info(path)["type"] == "file" except: # noqa: E722 return False @@ -942,9 +936,6 @@ def __init__(self, fs: HfFileSystem, path: str, revision: Optional[str] = None, f"{e}.\nMake sure the repository and revision exist before writing data." ) from e raise - # avoid an unnecessary .info() call with expensive expand_info=True to instantiate .details - if kwargs.get("mode", "rb") == "rb": - self.details = fs.info(self.resolved_path.unresolve(), expand_info=False) super().__init__(fs, self.resolved_path.unresolve(), **kwargs) self.fs: HfFileSystem From 929e884b509ec94420b1877424756aa27f5f9e57 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 2 Jul 2025 15:07:09 +0200 Subject: [PATCH 2/5] update tests --- tests/test_hf_file_system.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_hf_file_system.py b/tests/test_hf_file_system.py index 0c829f5ad6..d4ef98f6ba 100644 --- a/tests/test_hf_file_system.py +++ b/tests/test_hf_file_system.py @@ -66,13 +66,13 @@ def test_info(self): self.assertEqual(root_dir["type"], "directory") self.assertEqual(root_dir["size"], 0) self.assertTrue(root_dir["name"].endswith(self.repo_id)) - self.assertIsNotNone(root_dir["last_commit"]) + self.assertIsNone(root_dir["last_commit"]) data_dir = self.hffs.info(self.hf_path + "/data") self.assertEqual(data_dir["type"], "directory") self.assertEqual(data_dir["size"], 0) self.assertTrue(data_dir["name"].endswith("/data")) - self.assertIsNotNone(data_dir["last_commit"]) + self.assertIsNone(data_dir["last_commit"]) self.assertIsNotNone(data_dir["tree_id"]) text_data_file = self.hffs.info(self.text_file) @@ -80,7 +80,7 @@ def test_info(self): self.assertGreater(text_data_file["size"], 0) # not empty self.assertTrue(text_data_file["name"].endswith("/data/text_data.txt")) self.assertIsNone(text_data_file["lfs"]) - self.assertIsNotNone(text_data_file["last_commit"]) + self.assertIsNone(text_data_file["last_commit"]) self.assertIsNotNone(text_data_file["blob_id"]) self.assertIn("security", text_data_file) # the staging endpoint does not run security checks From c8bdc38143f3f6e86cec2c6f87de5a488531d598 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 3 Jul 2025 18:35:39 +0200 Subject: [PATCH 3/5] fix tests --- src/huggingface_hub/hf_file_system.py | 3 ++- tests/test_hf_file_system.py | 15 ++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index 109dcd4ea1..7c20d6e5eb 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -647,7 +647,7 @@ def modified(self, path: str, **kwargs) -> datetime: Returns: `datetime`: Last commit date of the file. """ - info = self.info(path, **kwargs) + info = self.info(path, expand_info=True, **kwargs) return info["last_commit"]["date"] def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, **kwargs) -> Dict[str, Any]: @@ -685,6 +685,7 @@ def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, "name": path, "size": 0, "type": "directory", + "last_commit": None, } if expand_info: last_commit = self._api.list_repo_commits( diff --git a/tests/test_hf_file_system.py b/tests/test_hf_file_system.py index d4ef98f6ba..428022f0ce 100644 --- a/tests/test_hf_file_system.py +++ b/tests/test_hf_file_system.py @@ -277,13 +277,13 @@ def test_list_root_directory_no_revision(self): self.assertEqual(files[0]["type"], "directory") self.assertEqual(files[0]["size"], 0) self.assertTrue(files[0]["name"].endswith("/data")) - self.assertIsNotNone(files[0]["last_commit"]) + self.assertIsNone(files[0]["last_commit"]) self.assertIsNotNone(files[0]["tree_id"]) self.assertEqual(files[1]["type"], "file") self.assertGreater(files[1]["size"], 0) # not empty self.assertTrue(files[1]["name"].endswith("/.gitattributes")) - self.assertIsNotNone(files[1]["last_commit"]) + self.assertIsNone(files[1]["last_commit"]) self.assertIsNotNone(files[1]["blob_id"]) self.assertIn("security", files[1]) # the staging endpoint does not run security checks @@ -298,7 +298,7 @@ def test_list_data_directory_no_revision(self): self.assertIn("sha256", files[0]["lfs"]) self.assertIn("size", files[0]["lfs"]) self.assertIn("pointer_size", files[0]["lfs"]) - self.assertIsNotNone(files[0]["last_commit"]) + self.assertIsNone(files[0]["last_commit"]) self.assertIsNotNone(files[0]["blob_id"]) self.assertIn("security", files[0]) # the staging endpoint does not run security checks @@ -306,7 +306,7 @@ def test_list_data_directory_no_revision(self): self.assertGreater(files[1]["size"], 0) # not empty self.assertTrue(files[1]["name"].endswith("/data/text_data.txt")) self.assertIsNone(files[1]["lfs"]) - self.assertIsNotNone(files[1]["last_commit"]) + self.assertIsNone(files[1]["last_commit"]) self.assertIsNotNone(files[1]["blob_id"]) self.assertIn("security", files[1]) # the staging endpoint does not run security checks @@ -318,7 +318,7 @@ def test_list_data_file_no_revision(self): self.assertGreater(files[0]["size"], 0) # not empty self.assertTrue(files[0]["name"].endswith("/data/text_data.txt")) self.assertIsNone(files[0]["lfs"]) - self.assertIsNotNone(files[0]["last_commit"]) + self.assertIsNone(files[0]["last_commit"]) self.assertIsNotNone(files[0]["blob_id"]) self.assertIn("security", files[0]) # the staging endpoint does not run security checks @@ -351,6 +351,11 @@ def test_list_root_directory_no_revision_no_detail_then_with_detail(self): files = self.hffs.ls(self.hf_path, detail=True) self.assertEqual(len(files), 2) self.assertTrue(files[0]["name"].endswith("/data") and files[1]["name"].endswith("/.gitattributes")) + self.assertIsNone(self.hffs.dircache[self.hf_path][0]["last_commit"]) # no expand_info -> no last_commit in cache + + files = self.hffs.ls(self.hf_path, detail=True, expand_info=True) + self.assertEqual(len(files), 2) + self.assertTrue(files[0]["name"].endswith("/data") and files[1]["name"].endswith("/.gitattributes")) self.assertIsNotNone(self.hffs.dircache[self.hf_path][0]["last_commit"]) def test_find_root_directory_no_revision(self): From b2deadf8063063be5facdee467282bb34a7e5ffc Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 3 Jul 2025 18:46:17 +0200 Subject: [PATCH 4/5] again --- tests/test_hf_file_system.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_hf_file_system.py b/tests/test_hf_file_system.py index 428022f0ce..16dd3b5ee1 100644 --- a/tests/test_hf_file_system.py +++ b/tests/test_hf_file_system.py @@ -140,7 +140,7 @@ def test_glob(self): self.assertTrue( files[keys[0]]["name"].endswith("/.gitattributes") and files[keys[1]]["name"].endswith("/data") ) - self.assertIsNotNone(files[keys[0]]["last_commit"]) + self.assertIsNone(files[keys[0]]["last_commit"]) def test_url(self): self.assertEqual( @@ -351,7 +351,9 @@ def test_list_root_directory_no_revision_no_detail_then_with_detail(self): files = self.hffs.ls(self.hf_path, detail=True) self.assertEqual(len(files), 2) self.assertTrue(files[0]["name"].endswith("/data") and files[1]["name"].endswith("/.gitattributes")) - self.assertIsNone(self.hffs.dircache[self.hf_path][0]["last_commit"]) # no expand_info -> no last_commit in cache + self.assertIsNone( + self.hffs.dircache[self.hf_path][0]["last_commit"] + ) # no expand_info -> no last_commit in cache files = self.hffs.ls(self.hf_path, detail=True, expand_info=True) self.assertEqual(len(files), 2) From ebe4ae6d8bb1efb8124def394e965fb8b91ffcde Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Fri, 4 Jul 2025 12:59:18 +0200 Subject: [PATCH 5/5] Update src/huggingface_hub/hf_file_system.py Co-authored-by: Lucain --- src/huggingface_hub/hf_file_system.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index 7c20d6e5eb..050a1d9bf9 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -647,7 +647,7 @@ def modified(self, path: str, **kwargs) -> datetime: Returns: `datetime`: Last commit date of the file. """ - info = self.info(path, expand_info=True, **kwargs) + info = self.info(path, **{**kwargs, "expand_info": True}) return info["last_commit"]["date"] def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, **kwargs) -> Dict[str, Any]: