From bef5fdc7644093eb04adfda734c8eebd44c9e2fa Mon Sep 17 00:00:00 2001 From: Pierre Camilleri <22995923+pierrecamilleri@users.noreply.github.com> Date: Fri, 13 Dec 2024 17:17:33 +0100 Subject: [PATCH 01/23] fix: parallel file validation for a datapackage --- frictionless/validator/validator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/frictionless/validator/validator.py b/frictionless/validator/validator.py index 523143cdbc..7362b61677 100644 --- a/frictionless/validator/validator.py +++ b/frictionless/validator/validator.py @@ -200,6 +200,8 @@ def validate_resource( def validate_parallel(options: types.IDescriptor) -> types.IDescriptor: + from ..resource import Resource # Local import avoids circular import + resource_options = options["resource"] validate_options = options["validate"] resource = Resource.from_descriptor(**resource_options) From b0554a26b78d911d21e3720ad78440175ba2a289 Mon Sep 17 00:00:00 2001 From: Pierre Camilleri Date: Mon, 27 Jan 2025 11:35:07 +0100 Subject: [PATCH 02/23] =?UTF-8?q?=F0=9F=94=B5=20rename=20local=20variable?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frictionless/validator/validator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/frictionless/validator/validator.py b/frictionless/validator/validator.py index 7362b61677..e43b659b24 100644 --- a/frictionless/validator/validator.py +++ b/frictionless/validator/validator.py @@ -33,7 +33,9 @@ def validate_package( timer = helpers.Timer() reports: List[Report] = [] resources = package.resources if name is None else [package.get_resource(name)] - with_fks = any(res.schema and res.schema.foreign_keys for res in resources) + with_foreign_keys = any( + res.schema and res.schema.foreign_keys for res in resources + ) # Prepare checklist checklist = checklist or Checklist() @@ -45,7 +47,7 @@ def validate_package( return Report.from_validation(time=timer.time, errors=exception.to_errors()) # Validate sequential - if not parallel or with_fks: + if not parallel or with_foreign_keys: for resource in resources: report = resource.validate( checklist=checklist, From f20aab1a96c9914a75ef7009e72225f4c095e338 Mon Sep 17 00:00:00 2001 From: Pierre Camilleri Date: Mon, 27 Jan 2025 11:43:23 +0100 Subject: [PATCH 03/23] =?UTF-8?q?=F0=9F=94=B5=20improve=20documentation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frictionless/package/package.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/frictionless/package/package.py b/frictionless/package/package.py index bf5e180c7f..7dcfd7df28 100644 --- a/frictionless/package/package.py +++ b/frictionless/package/package.py @@ -476,7 +476,8 @@ def validate( Parameters: checklist? (checklist): a Checklist object - parallel? (bool): run in parallel if possible + parallel? (bool): run in parallel if possible. Parallel execution + is not possible if foreign keys are used in a resource schema. 
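
A note on the fix in patch 01 above: frictionless/resource/resource.py imports the validator package at load time (patch 04 below removes that import), so a top-level `from ..resource import Resource` in validator.py would complete a circular import. Deferring the import into the worker function sidesteps the cycle; a minimal sketch of the pattern, not the verbatim frictionless code:

    def validate_parallel(options):
        # Deferred import: by the time a multiprocessing worker calls this,
        # both modules are fully initialized, so no import cycle can trigger.
        from frictionless import Resource

        resource = Resource.from_descriptor(**options["resource"])
        return resource.validate(**options["validate"]).to_descriptor()
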
Returns: Report: validation report From bc27f9bec8da43ae3df482a8e3d6da703f5265a6 Mon Sep 17 00:00:00 2001 From: Pierre Camilleri Date: Mon, 27 Jan 2025 13:45:06 +0100 Subject: [PATCH 04/23] Mv `validate` methods & soft deprecate Validator --- frictionless/__init__.py | 2 + frictionless/package/package.py | 69 +++++++-- frictionless/resource/resource.py | 114 ++++++++++++++- frictionless/resources/table.py | 25 +--- frictionless/validator/validator.py | 214 +++------------------------- 5 files changed, 190 insertions(+), 234 deletions(-) diff --git a/frictionless/__init__.py b/frictionless/__init__.py index 163a0df895..a086d97455 100644 --- a/frictionless/__init__.py +++ b/frictionless/__init__.py @@ -40,4 +40,6 @@ from .table import Lookup as Lookup from .table import Row as Row from .transformer import Transformer as Transformer + +# Deprecated from .validator import Validator as Validator diff --git a/frictionless/package/package.py b/frictionless/package/package.py index 7dcfd7df28..4fdd2c555d 100644 --- a/frictionless/package/package.py +++ b/frictionless/package/package.py @@ -1,24 +1,25 @@ from __future__ import annotations +from multiprocessing import Pool from typing import TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Union import attrs from typing_extensions import Self from .. import errors, fields, helpers, settings +from ..checklist import Checklist from ..exception import FrictionlessException from ..metadata import Metadata from ..platform import platform +from ..report import Report from ..resource import Resource from ..system import system from ..transformer import Transformer -from ..validator import Validator from .factory import Factory if TYPE_CHECKING: from .. import types from ..catalog import Dataset - from ..checklist import Checklist from ..detector import Detector from ..dialect import Control, Dialect from ..indexer import IOnProgress, IOnRow @@ -483,14 +484,54 @@ def validate( Report: validation report """ - validator = Validator() - return validator.validate_package( - self, - checklist=checklist, - name=name, - parallel=parallel, - limit_rows=limit_rows, - limit_errors=limit_errors, + # Create state + timer = helpers.Timer() + reports: List[Report] = [] + resources = self.resources if name is None else [self.get_resource(name)] + with_foreign_keys = any( + res.schema and res.schema.foreign_keys for res in resources + ) + + # Prepare checklist + checklist = checklist or Checklist() + + # Validate metadata + try: + self.to_descriptor(validate=True) + except FrictionlessException as exception: + return Report.from_validation(time=timer.time, errors=exception.to_errors()) + + # Validate sequential + if not parallel or with_foreign_keys: + for resource in resources: + report = resource.validate( + checklist=checklist, + limit_errors=limit_errors, + limit_rows=limit_rows, + ) + reports.append(report) + + # Validate parallel + else: + with Pool() as pool: + options_pool: List[Dict[str, Any]] = [] + for resource in resources: + options: Any = {} + options["resource"] = {} + options["resource"]["descriptor"] = resource.to_descriptor() + options["resource"]["basepath"] = resource.basepath + options["validate"] = {} + options["validate"]["limit_rows"] = limit_rows + options["validate"]["limit_errors"] = limit_errors + options_pool.append(options) + report_descriptors = pool.map(_validate_parallel, options_pool) + for report_descriptor in report_descriptors: + reports.append(Report.from_descriptor(report_descriptor)) + + # Return report + return 
Report.from_validation_reports( + time=timer.time, + reports=reports, ) # Convert @@ -707,3 +748,11 @@ def metadata_export(self): # type: ignore # descriptor = {"$frictionless": "package/v2", **descriptor} return descriptor + + +def _validate_parallel(options: types.IDescriptor) -> types.IDescriptor: + resource_options = options["resource"] + validate_options = options["validate"] + resource = Resource.from_descriptor(**resource_options) + report = resource.validate(**validate_options) + return report.to_descriptor() diff --git a/frictionless/resource/resource.py b/frictionless/resource/resource.py index 3d93d85c95..dfff73ffab 100644 --- a/frictionless/resource/resource.py +++ b/frictionless/resource/resource.py @@ -8,22 +8,22 @@ from typing_extensions import Self from .. import errors, fields, helpers, settings +from ..checklist import Checklist from ..detector import Detector from ..dialect import Control, Dialect from ..exception import FrictionlessException from ..metadata import Metadata from ..platform import platform +from ..report import Report from ..schema import Schema from ..system import system -from ..validator import Validator from .factory import Factory from .stats import ResourceStats if TYPE_CHECKING: from .. import types - from ..checklist import Checklist + from ..error import Error from ..package import Package - from ..report import Report from ..system import Loader @@ -619,8 +619,112 @@ def validate( Report: validation report """ - validator = Validator() - return validator.validate_resource(self, checklist=checklist) + # Create state + partial = False + timer = helpers.Timer() + labels: List[str] = [] + errors: List[Error] = [] + warnings: List[str] = [] + + # Prepare checklist + checklist = checklist or Checklist() + checks = checklist.connect(self) + + # Validate metadata + try: + self.to_descriptor(validate=True) + except FrictionlessException as exception: + return Report.from_validation_task( + self, time=timer.time, errors=exception.to_errors() + ) + + # TODO: remove in next version + # Ignore not-supported hashings + if self.hash: + algorithm, _ = helpers.parse_resource_hash_v1(self.hash) + if algorithm not in ["md5", "sha256"]: + warning = "hash is ignored; supported algorithms: md5/sha256" + warnings.append(warning) + + # Prepare resource + if self.closed: + try: + self.open() + except FrictionlessException as exception: + self.close() + return Report.from_validation_task( + self, time=timer.time, errors=exception.to_errors() + ) + + # Validate data + with self: + # Validate start + for index, check in enumerate(checks): + for error in check.validate_start(): + if error.type == "check-error": + del checks[index] + if checklist.match(error): + errors.append(error) + + # Validate file + if not isinstance(self, platform.frictionless_resources.TableResource): + if self.hash is not None or self.bytes is not None: + helpers.pass_through(self.byte_stream) + + # Validate table + else: + row_count = 0 + labels = self.labels + while True: + row_count += 1 + + # Emit row + try: + row = next(self.row_stream) # type: ignore + except FrictionlessException as exception: + errors.append(exception.error) + continue + except StopIteration: + break + + # Validate row + for check in checks: + for error in check.validate_row(row): + if checklist.match(error): + errors.append(error) + + # Callback row + if on_row: + on_row(row) + + # Limit rows + if limit_rows: + if row_count >= limit_rows: + warning = f"reached row limit: {limit_rows}" + warnings.append(warning) + partial = 
True + break + + # Limit errors + if limit_errors: + if len(errors) >= limit_errors: + errors = errors[:limit_errors] + warning = f"reached error limit: {limit_errors}" + warnings.append(warning) + partial = True + break + + # Validate end + if not partial: + for check in checks: + for error in check.validate_end(): + if checklist.match(error): + errors.append(error) + + # Return report + return Report.from_validation_task( + self, time=timer.time, labels=labels, errors=errors, warnings=warnings + ) # Export diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index fdf9ffce4b..5891ace0b9 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -7,7 +7,7 @@ from frictionless.schema.field import Field -from .. import errors, helpers, settings +from .. import errors, helpers from ..analyzer import Analyzer from ..dialect import Dialect from ..exception import FrictionlessException @@ -17,11 +17,9 @@ from ..system import system from ..table import Header, Lookup, Row, Table from ..transformer import Transformer -from ..validator import Validator if TYPE_CHECKING: from .. import types - from ..checklist import Checklist from ..indexer import IOnProgress, IOnRow from ..pipeline import Pipeline from ..system import Loader, Parser @@ -626,27 +624,6 @@ def transform(self, pipeline: Pipeline): transformer = Transformer() return transformer.transform_table_resource(self, pipeline) - # Validate - - def validate( - self, - checklist: Optional[Checklist] = None, - *, - name: Optional[str] = None, - on_row: Optional[types.ICallbackFunction] = None, - parallel: bool = False, - limit_rows: Optional[int] = None, - limit_errors: int = settings.DEFAULT_LIMIT_ERRORS, - ): - validator = Validator() - return validator.validate_resource( - self, - checklist=checklist, - on_row=on_row, - limit_rows=limit_rows, - limit_errors=limit_errors, - ) - # Export def to_view(self, type: str = "look", **options: Any): diff --git a/frictionless/validator/validator.py b/frictionless/validator/validator.py index e43b659b24..da11c94437 100644 --- a/frictionless/validator/validator.py +++ b/frictionless/validator/validator.py @@ -1,211 +1,35 @@ from __future__ import annotations -from multiprocessing import Pool -from typing import TYPE_CHECKING, Any, Dict, List, Optional - -from .. import helpers, settings -from ..checklist import Checklist -from ..exception import FrictionlessException -from ..platform import platform -from ..report import Report +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: - from .. import types - from ..error import Error from ..package import Package from ..resource import Resource class Validator: - # Package - - def validate_package( - self, - package: Package, - *, - checklist: Optional[Checklist] = None, - name: Optional[str] = None, - parallel: bool = False, - limit_rows: Optional[int] = None, - limit_errors: int = settings.DEFAULT_LIMIT_ERRORS, - ): - # Create state - timer = helpers.Timer() - reports: List[Report] = [] - resources = package.resources if name is None else [package.get_resource(name)] - with_foreign_keys = any( - res.schema and res.schema.foreign_keys for res in resources - ) - - # Prepare checklist - checklist = checklist or Checklist() + """ + Validator is deprecated, and only kept for backwards compatibility. 
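
In usage terms, the soft deprecation introduced here amounts to the following (a sketch; data/invalid.csv is one of the repository's test fixtures, and the warning text comes from the limit handling moved into resource.py above):

    from frictionless import Validator
    from frictionless.resources import TableResource

    resource = TableResource(path="data/invalid.csv")

    # Before this patch: a standalone Validator object did the work
    report = Validator().validate_resource(resource)

    # After: validation lives on Resource itself, with the same options
    report = resource.validate(limit_errors=2)
    print(report.valid)          # False
    print(report.task.warnings)  # ['reached error limit: 2']
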
 
-        # Validate metadata
-        try:
-            package.to_descriptor(validate=True)
-        except FrictionlessException as exception:
-            return Report.from_validation(time=timer.time, errors=exception.to_errors())
+    There is no plan to remove this class in future versions.
+    """
 
-        # Validate sequential
-        if not parallel or with_foreign_keys:
-            for resource in resources:
-                report = resource.validate(
-                    checklist=checklist,
-                    limit_errors=limit_errors,
-                    limit_rows=limit_rows,
-                )
-                reports.append(report)
+    def validate_package(self, package: Package, *args: Any, **kwargs: Any):
+        """
+        Validator.validate_package is deprecated; use Package.validate
+        instead.
 
-        # Validate parallel
-        else:
-            with Pool() as pool:
-                options_pool: List[Dict[str, Any]] = []
-                for resource in resources:
-                    options = {}
-                    options["resource"] = {}
-                    options["resource"]["descriptor"] = resource.to_descriptor()
-                    options["resource"]["basepath"] = resource.basepath
-                    options["validate"] = {}
-                    options["validate"]["limit_rows"] = limit_rows
-                    options["validate"]["limit_errors"] = limit_errors
-                    options_pool.append(options)
-                report_descriptors = pool.map(validate_parallel, options_pool)
-                for report_descriptor in report_descriptors:
-                    reports.append(Report.from_descriptor(report_descriptor))
+        There is no plan to remove this method in future versions.
+        """
+        return package.validate(*args, **kwargs)
 
-        # Return report
-        return Report.from_validation_reports(
-            time=timer.time,
-            reports=reports,
-        )
 
     # Resource
 
-    def validate_resource(
-        self,
-        resource: Resource,
-        *,
-        checklist: Optional[Checklist] = None,
-        limit_errors: int = settings.DEFAULT_LIMIT_ERRORS,
-        limit_rows: Optional[int] = None,
-        on_row: Optional[types.ICallbackFunction] = None,
-    ):
-        # Create state
-        partial = False
-        timer = helpers.Timer()
-        labels: List[str] = []
-        errors: List[Error] = []
-        warnings: List[str] = []
-
-        # Prepare checklist
-        checklist = checklist or Checklist()
-        checks = checklist.connect(resource)
-
-        # Validate metadata
-        try:
-            resource.to_descriptor(validate=True)
-        except FrictionlessException as exception:
-            return Report.from_validation_task(
-                resource, time=timer.time, errors=exception.to_errors()
-            )
-
-        # TODO: remove in next version
-        # Ignore not-supported hashings
-        if resource.hash:
-            algorithm, _ = helpers.parse_resource_hash_v1(resource.hash)
-            if algorithm not in ["md5", "sha256"]:
-                warning = "hash is ignored; supported algorithms: md5/sha256"
-                warnings.append(warning)
-
-        # Prepare resource
-        if resource.closed:
-            try:
-                resource.open()
-            except FrictionlessException as exception:
-                resource.close()
-                return Report.from_validation_task(
-                    resource, time=timer.time, errors=exception.to_errors()
-                )
-
-        # Validate data
-        with resource:
-            # Validate start
-            for index, check in enumerate(checks):
-                for error in check.validate_start():
-                    if error.type == "check-error":
-                        del checks[index]
-                    if checklist.match(error):
-                        errors.append(error)
-
-            # Validate file
-            if not isinstance(resource, platform.frictionless_resources.TableResource):
-                if resource.hash is not None or resource.bytes is not None:
-                    helpers.pass_through(resource.byte_stream)
-
-            # Validate table
-            else:
-                row_count = 0
-                labels = resource.labels
-                while True:
-                    row_count += 1
-
-                    # Emit row
-                    try:
-                        row = next(resource.row_stream)  # type: ignore
-                    except FrictionlessException as exception:
-                        errors.append(exception.error)
-                        continue
-                    except StopIteration:
-                        break
-
-                    # Validate row
-                    for check in checks:
-                        for error in check.validate_row(row):
-                            if checklist.match(error):
-                                errors.append(error)
-
-                    # Callback row
-                    if on_row:
-                        on_row(row)
-
-                    # Limit rows
-                    if limit_rows:
-                        if row_count >= limit_rows:
-                            warning = f"reached row limit: {limit_rows}"
-                            warnings.append(warning)
-                            partial = True
-                            break
-
-                    # Limit errors
-                    if limit_errors:
-                        if len(errors) >= limit_errors:
-                            errors = errors[:limit_errors]
-                            warning = f"reached error limit: {limit_errors}"
-                            warnings.append(warning)
-                            partial = True
-                            break
-
-            # Validate end
-            if not partial:
-                for check in checks:
-                    for error in check.validate_end():
-                        if checklist.match(error):
-                            errors.append(error)
-
-        # Return report
-        return Report.from_validation_task(
-            resource, time=timer.time, labels=labels, errors=errors, warnings=warnings
-        )
-
-
-# Internal
-
-
-def validate_parallel(options: types.IDescriptor) -> types.IDescriptor:
-    from ..resource import Resource  # Local import avoids circular import
+    def validate_resource(self, resource: Resource, *args: Any, **kwargs: Any):
+        """
+        Validator.validate_resource is deprecated; use Resource.validate
+        instead.
 
-    resource_options = options["resource"]
-    validate_options = options["validate"]
-    resource = Resource.from_descriptor(**resource_options)
-    report = resource.validate(**validate_options)
-    return report.to_descriptor()
+        There is no plan to remove this method in future versions.
+        """
+        return resource.validate(*args, **kwargs)

From 0498e9b319fd4c835c89f85569f5c6c5eb4405a5 Mon Sep 17 00:00:00 2001
From: Pierre Camilleri
Date: Mon, 27 Jan 2025 14:15:54 +0100
Subject: [PATCH 05/23] fix: repair tests of parallel validation

---
 data/invalid/datapackage_no_foreign_key.json | 49 +++++++++++++++++++
 .../__spec__/package/test_parallel.py        | 11 +++--
 2 files changed, 56 insertions(+), 4 deletions(-)
 create mode 100644 data/invalid/datapackage_no_foreign_key.json

diff --git a/data/invalid/datapackage_no_foreign_key.json b/data/invalid/datapackage_no_foreign_key.json
new file mode 100644
index 0000000000..67be839f6d
--- /dev/null
+++ b/data/invalid/datapackage_no_foreign_key.json
@@ -0,0 +1,49 @@
+{
+  "name": "testing",
+  "resources": [
+    {
+      "name": "data",
+      "path": "data.csv",
+      "schema": {
+        "fields": [
+          {
+            "name": "id",
+            "type": "string",
+            "constraints": {
+              "required": true
+            }
+          },
+          {
+            "name": "name",
+            "type": "string"
+          },
+          {
+            "name": "description",
+            "type": "string"
+          },
+          {
+            "name": "amount",
+            "type": "number"
+          }
+        ],
+        "primaryKey": "id"
+      }
+    },
+    {
+      "name": "data2",
+      "path": "data2.csv",
+      "schema": {
+        "fields": [
+          {
+            "type": "string",
+            "name": "parent"
+          },
+          {
+            "type": "string",
+            "name": "comment"
+          }
+        ]
+      }
+    }
+  ]
+}
diff --git a/frictionless/validator/__spec__/package/test_parallel.py b/frictionless/validator/__spec__/package/test_parallel.py
index a54673a69d..0ee0e14b1e 100644
--- a/frictionless/validator/__spec__/package/test_parallel.py
+++ b/frictionless/validator/__spec__/package/test_parallel.py
@@ -6,18 +6,21 @@
 
 # General
 
+# Note: to test parallel validation, do not use foreign keys to prevent an
+# automatic fallback on single-core execution
+
 
 @pytest.mark.ci
 def test_package_validate_parallel_from_dict():
-    with open("data/package/datapackage.json") as file:
-        package = Package(json.load(file), basepath="data/package")
+    with open("data/datapackage.json") as file:
+        package = Package(json.load(file), basepath="data")
     report = package.validate(parallel=True)
     assert report.valid
 
 
 @pytest.mark.ci
 def test_package_validate_parallel_from_dict_invalid():
-    with open("data/invalid/datapackage.json")
as file: + with open("data/invalid/datapackage_no_foreign_key.json") as file: package = Package(json.load(file), basepath="data/invalid") report = package.validate(parallel=True) assert report.flatten(["taskNumber", "rowNumber", "fieldNumber", "type"]) == [ @@ -29,7 +32,7 @@ def test_package_validate_parallel_from_dict_invalid(): @pytest.mark.ci def test_package_validate_with_parallel(): - package = Package("data/invalid/datapackage.json") + package = Package("data/invalid/datapackage_no_foreign_key.json") report = package.validate(parallel=True) assert report.flatten(["taskNumber", "rowNumber", "fieldNumber", "type"]) == [ [1, 3, None, "blank-row"], From e2f2a33de8f021cd50e3bae363d81d7f9113a202 Mon Sep 17 00:00:00 2001 From: Pierre Camilleri Date: Mon, 27 Jan 2025 15:12:52 +0100 Subject: [PATCH 06/23] Dispatching tests according to method change --- .../__spec__/test_validate.py} | 317 +++++++++++++++++- .../__spec__/test_validate.py} | 308 +++++++++++++++++ .../__spec__/test_validate_dialect.py} | 2 +- .../__spec__/test_validate_schema.py} | 0 frictionless/validator/__spec__/__init__.py | 0 .../validator/__spec__/package/__init__.py | 0 .../__spec__/package/test_checklist.py | 23 -- .../__spec__/package/test_parallel.py | 41 --- .../validator/__spec__/package/test_schema.py | 179 ---------- .../validator/__spec__/package/test_stats.py | 79 ----- .../validator/__spec__/resource/__init__.py | 0 .../__spec__/resource/test_checklist.py | 14 - .../__spec__/resource/test_compression.py | 23 -- .../__spec__/resource/test_detector.py | 130 ------- .../__spec__/resource/test_encoding.py | 25 -- .../validator/__spec__/resource/test_file.py | 16 - .../__spec__/resource/test_format.py | 9 - .../__spec__/resource/test_scheme.py | 17 - .../validator/__spec__/resource/test_stats.py | 66 ---- 19 files changed, 625 insertions(+), 624 deletions(-) rename frictionless/{validator/__spec__/package/test_general.py => package/__spec__/test_validate.py} (52%) rename frictionless/{validator/__spec__/resource/test_general.py => resource/__spec__/test_validate.py} (61%) rename frictionless/{validator/__spec__/resource/test_dialect.py => resource/__spec__/test_validate_dialect.py} (99%) rename frictionless/{validator/__spec__/resource/test_schema.py => resource/__spec__/test_validate_schema.py} (100%) delete mode 100644 frictionless/validator/__spec__/__init__.py delete mode 100644 frictionless/validator/__spec__/package/__init__.py delete mode 100644 frictionless/validator/__spec__/package/test_checklist.py delete mode 100644 frictionless/validator/__spec__/package/test_parallel.py delete mode 100644 frictionless/validator/__spec__/package/test_schema.py delete mode 100644 frictionless/validator/__spec__/package/test_stats.py delete mode 100644 frictionless/validator/__spec__/resource/__init__.py delete mode 100644 frictionless/validator/__spec__/resource/test_checklist.py delete mode 100644 frictionless/validator/__spec__/resource/test_compression.py delete mode 100644 frictionless/validator/__spec__/resource/test_detector.py delete mode 100644 frictionless/validator/__spec__/resource/test_encoding.py delete mode 100644 frictionless/validator/__spec__/resource/test_file.py delete mode 100644 frictionless/validator/__spec__/resource/test_format.py delete mode 100644 frictionless/validator/__spec__/resource/test_scheme.py delete mode 100644 frictionless/validator/__spec__/resource/test_stats.py diff --git a/frictionless/validator/__spec__/package/test_general.py b/frictionless/package/__spec__/test_validate.py 
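
The renames in patch 06 move the validator specs next to the classes that now own validate(); the behaviour under test is unchanged. At usage level, the parallel path exercised by the moved package tests looks like this (a sketch reusing the fixture added in patch 05):

    from frictionless import Package

    # Foreign keys would trigger the silent fallback to sequential
    # validation, so the fixture deliberately declares none.
    package = Package("data/invalid/datapackage_no_foreign_key.json")
    report = package.validate(parallel=True)
    print(report.flatten(["taskNumber", "rowNumber", "type"]))
    # [[1, 3, 'blank-row'], [1, 3, 'primary-key'], [2, 4, 'blank-row']]
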
similarity index 52% rename from frictionless/validator/__spec__/package/test_general.py rename to frictionless/package/__spec__/test_validate.py index 485abbdac0..fdb1121508 100644 --- a/frictionless/validator/__spec__/package/test_general.py +++ b/frictionless/package/__spec__/test_validate.py @@ -1,5 +1,6 @@ import json import pathlib +from copy import deepcopy import pytest @@ -11,6 +12,7 @@ Resource, Schema, fields, + platform, ) # General @@ -302,7 +304,10 @@ def test_validate_package_using_detector_schema_sync_issue_847(): Resource( data=[["f1"], ["v1"], ["v2"], ["v3"]], schema=Schema( - fields=[fields.StringField(name="f1"), fields.StringField(name="f2")], + fields=[ + fields.StringField(name="f1"), + fields.StringField(name="f2"), + ], ), ), ] @@ -362,3 +367,313 @@ def test_package_licenses_required_path_or_name_issue_1290(): descriptor = {"resources": [], "licenses": [{"title": "title"}]} report = Package.validate_descriptor(descriptor) assert report.errors[0].note.count('license requires "path" or "name"') + + +def test_package_validate_with_skip_errors(): + ## Test runs on data with two blank-row errors, one primary-key error, see + # first test case + test_cases = [ + {"ignore": [], "expect_errors": ["blank-row", "primary-key", "blank-row"]}, + {"ignore": ["primary-key"], "expect_errors": ["blank-row", "blank-row"]}, + {"ignore": ["blank-row"], "expect_errors": ["primary-key"]}, + {"ignore": ["blank-row", "primary-key"], "expect_errors": []}, + ] + + for tc in test_cases: + with open("data/invalid/datapackage.json") as file: + package = Package(json.load(file), basepath="data/invalid") + checklist = Checklist(skip_errors=tc["ignore"]) + + report = package.validate(checklist) + + assert report.flatten(["type"]) == [[t] for t in tc["expect_errors"]] + + +# Stats + +DESCRIPTOR_SH = { + "resources": [ + { + "name": "resource1", + "path": "data/table.csv", + "hash": "sha256:a1fd6c5ff3494f697874deeb07f69f8667e903dd94a7bc062dd57550cea26da8", + "bytes": 30, + } + ] +} + + +@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows") +def test_package_validate_stats(): + source = deepcopy(DESCRIPTOR_SH) + package = Package(source) + report = package.validate() + assert report.valid + + +def test_package_validate_stats_invalid(): + source = deepcopy(DESCRIPTOR_SH) + source["resources"][0]["hash"] += "a" + source["resources"][0]["bytes"] += 1 + package = Package(source) + report = package.validate() + assert report.flatten(["rowNumber", "fieldNumber", "type"]) == [ + [None, None, "hash-count"], + [None, None, "byte-count"], + ] + + +@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows") +def test_package_validate_stats_size(): + source = deepcopy(DESCRIPTOR_SH) + source["resources"][0].pop("hash") + package = Package(source) + report = package.validate() + assert report.valid + + +def test_package_validate_stats_size_invalid(): + source = deepcopy(DESCRIPTOR_SH) + source["resources"][0]["bytes"] += 1 + source["resources"][0].pop("hash") + package = Package(source) + report = package.validate() + assert report.flatten(["rowNumber", "fieldNumber", "type"]) == [ + [None, None, "byte-count"], + ] + + +@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows") +def test_package_validate_stats_hash(): + source = deepcopy(DESCRIPTOR_SH) + source["resources"][0].pop("bytes") + package = Package(source) + report = package.validate() + assert report.valid + + +def test_package_validate_check_file_package_stats_hash_invalid(): + source = 
deepcopy(DESCRIPTOR_SH) + source["resources"][0].pop("bytes") + source["resources"][0]["hash"] += "a" + package = Package(source) + report = package.validate() + assert report.flatten(["rowNumber", "fieldNumber", "type"]) == [ + [None, None, "hash-count"], + ] + + +# Schema + +DESCRIPTOR_FK = { + "resources": [ + { + "name": "cities", + "data": [ + ["id", "name", "next_id"], + [1, "london", 2], + [2, "paris", 3], + [3, "rome", 4], + [4, "rio", None], + ], + "schema": { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "name", "type": "string"}, + {"name": "next_id", "type": "integer"}, + ], + "foreignKeys": [ + { + "fields": "next_id", + "reference": {"resource": "", "fields": "id"}, + }, + { + "fields": "id", + "reference": {"resource": "people", "fields": "label"}, + }, + ], + }, + }, + { + "name": "people", + "data": [["label", "population"], [1, 8], [2, 2], [3, 3], [4, 6]], + }, + ], +} + +MULTI_FK_RESSOURCE = { + "name": "travel_time", + "data": [["from", "to", "hours"], [1, 2, 1.5], [2, 3, 8], [3, 4, 18]], + "schema": { + "fields": [ + {"name": "from", "type": "integer"}, + {"name": "to", "type": "integer"}, + {"name": "hours", "type": "number"}, + ], + "foreignKeys": [ + { + "fields": ["from", "to"], + "reference": {"resource": "cities", "fields": ["id", "next_id"]}, + } + ], + }, +} + + +def test_package_validate_schema_foreign_key_error(): + descriptor = deepcopy(DESCRIPTOR_FK) + package = Package(descriptor) + report = package.validate() + assert report.valid + + +def test_package_validate_schema_foreign_key_not_defined(): + descriptor = deepcopy(DESCRIPTOR_FK) + del descriptor["resources"][0]["schema"]["foreignKeys"] + package = Package(descriptor) + report = package.validate() + assert report.valid + + +def test_package_validate_schema_foreign_key_self_referenced_resource_violation(): + descriptor = deepcopy(DESCRIPTOR_FK) + del descriptor["resources"][0]["data"][4] + package = Package(descriptor) + report = package.validate() + assert report.flatten(["rowNumber", "fieldNumber", "type", "cells"]) == [ + [4, None, "foreign-key", ["3", "rome", "4"]], + ] + + +def test_package_validate_schema_foreign_key_internal_resource_violation(): + descriptor = deepcopy(DESCRIPTOR_FK) + del descriptor["resources"][1]["data"][4] + package = Package(descriptor) + report = package.validate() + assert report.flatten(["rowNumber", "fieldNumber", "type", "cells"]) == [ + [5, None, "foreign-key", ["4", "rio", ""]], + ] + + +def test_package_validate_schema_foreign_key_internal_resource_violation_non_existent(): + descriptor = deepcopy(DESCRIPTOR_FK) + descriptor["resources"][1]["data"] = [["label", "population"], [10, 10]] + package = Package(descriptor) + report = package.validate() + assert report.flatten(["rowNumber", "fieldNumber", "type", "cells"]) == [ + [2, None, "foreign-key", ["1", "london", "2"]], + [3, None, "foreign-key", ["2", "paris", "3"]], + [4, None, "foreign-key", ["3", "rome", "4"]], + [5, None, "foreign-key", ["4", "rio", ""]], + ] + + +def test_package_validate_schema_multiple_foreign_key(): + descriptor = deepcopy(DESCRIPTOR_FK) + descriptor["resources"].append(MULTI_FK_RESSOURCE) + package = Package(descriptor) + report = package.validate() + assert report.valid + + +def test_package_validate_schema_multiple_foreign_key_resource_violation_non_existent(): + descriptor = deepcopy(DESCRIPTOR_FK) + # remove London + del descriptor["resources"][0]["data"][1] + descriptor["resources"].append(MULTI_FK_RESSOURCE) + package = Package(descriptor) + report = 
package.validate() + assert report.flatten(["rowNumber", "fieldNumber", "type", "cells", "note"]) == [ + [ + 2, + None, + "foreign-key", + ["1", "2", "1.5"], + 'for "from, to": values "1, 2" not found in the lookup table "cities" as "id, next_id"', + ], + ] + + +def test_package_validate_schema_multiple_foreign_key_violations(): + descriptor = deepcopy(DESCRIPTOR_FK) + # Add some wrong fks + descriptor["resources"][0]["data"][3][0] = 5 + descriptor["resources"][0]["data"][4][0] = 6 + descriptor["resources"].append(MULTI_FK_RESSOURCE) + package = Package(descriptor) + report = package.validate() + assert report.flatten( + [ + "rowNumber", + "fieldNames", + "fieldCells", + "referenceName", + "referenceFieldNames", + ] + ) == [ + [3, ["next_id"], ["3"], "", ["id"]], + [4, ["next_id"], ["4"], "", ["id"]], + [4, ["id"], ["5"], "people", ["label"]], + [5, ["id"], ["6"], "people", ["label"]], + [4, ["from", "to"], ["3", "4"], "cities", ["id", "next_id"]], + ] + + +# Bugs + + +def test_package_validate_using_detector_schema_sync_issue_847(): + package = Package( + resources=[ + Resource( + data=[["f1"], ["v1"], ["v2"], ["v3"]], + schema=Schema( + fields=[ + fields.AnyField(name="f1"), + fields.AnyField(name="f2"), + ] + ), + ), + ] + ) + for resource in package.resources: + resource.detector = Detector(schema_sync=True) + report = package.validate() + assert report.valid + + +# Parallel + +# Note: to test parallel validation, do not use foreign keys to prevent an +# automatic fallback on single-core execution + + +@pytest.mark.ci +def test_package_validate_parallel_from_dict(): + with open("data/datapackage.json") as file: + package = Package(json.load(file), basepath="data") + report = package.validate(parallel=True) + assert report.valid + + +@pytest.mark.ci +def test_package_validate_parallel_from_dict_invalid(): + with open("data/invalid/datapackage_no_foreign_key.json") as file: + package = Package(json.load(file), basepath="data/invalid") + report = package.validate(parallel=True) + assert report.flatten(["taskNumber", "rowNumber", "fieldNumber", "type"]) == [ + [1, 3, None, "blank-row"], + [1, 3, None, "primary-key"], + [2, 4, None, "blank-row"], + ] + + +@pytest.mark.ci +def test_package_validate_with_parallel(): + package = Package("data/invalid/datapackage_no_foreign_key.json") + report = package.validate(parallel=True) + assert report.flatten(["taskNumber", "rowNumber", "fieldNumber", "type"]) == [ + [1, 3, None, "blank-row"], + [1, 3, None, "primary-key"], + [2, 4, None, "blank-row"], + ] diff --git a/frictionless/validator/__spec__/resource/test_general.py b/frictionless/resource/__spec__/test_validate.py similarity index 61% rename from frictionless/validator/__spec__/resource/test_general.py rename to frictionless/resource/__spec__/test_validate.py index 31002ad4b1..e5a539358f 100644 --- a/frictionless/validator/__spec__/resource/test_general.py +++ b/frictionless/resource/__spec__/test_validate.py @@ -6,9 +6,12 @@ Check, Checklist, Detector, + Dialect, FrictionlessException, Resource, + Schema, errors, + platform, ) from frictionless.resources import TableResource @@ -450,3 +453,308 @@ def test_resource_validate_resource_metadata_errors_with_fields_993(): assert error.note == "descriptor is not valid" assert reasons[0].type == "resource-error" assert reasons[0].note == '"fields" should be set as "schema.fields"' + + +# Checklist + + +def test_resource_validate_bound_checklist(): + checklist = Checklist(pick_errors=["blank-label", "blank-row"]) + resource = 
TableResource(path="data/invalid.csv") + report = resource.validate(checklist) + assert report.flatten(["rowNumber", "fieldNumber", "type"]) == [ + [None, 3, "blank-label"], + [4, None, "blank-row"], + ] + + +# Compression + + +def test_resource_validate_compression(): + resource = TableResource(path="data/table.csv.zip") + report = resource.validate() + assert report.valid + + +def test_resource_validate_compression_explicit(): + resource = TableResource(path="data/table.csv.zip", compression="zip") + report = resource.validate() + assert report.valid + + +def test_resource_validate_compression_invalid(): + resource = TableResource(path="data/table.csv.zip", compression="bad") + report = resource.validate() + assert report.flatten(["type", "note"]) == [ + ["compression-error", 'compression "bad" is not supported'], + ] + + +# Detector + + +def test_resource_validate_detector_sync_schema(): + schema = Schema.from_descriptor( + { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "name", "type": "string"}, + ], + } + ) + detector = Detector(schema_sync=True) + resource = TableResource( + path="data/sync-schema.csv", schema=schema, detector=detector + ) + report = resource.validate() + assert report.valid + assert resource.schema.to_descriptor() == { + "fields": [ + {"name": "name", "type": "string"}, + {"name": "id", "type": "integer"}, + ], + } + + +def test_resource_validate_detector_sync_schema_invalid(): + source = [["LastName", "FirstName", "Address"], ["Test", "Tester", "23 Avenue"]] + schema = Schema.from_descriptor( + { + "fields": [ + {"name": "id", "type": "string"}, + {"name": "FirstName", "type": "string"}, + {"name": "LastName", "type": "string"}, + ] + } + ) + detector = Detector(schema_sync=True) + resource = TableResource(data=source, schema=schema, detector=detector) + report = resource.validate() + assert report.valid + + +def test_resource_validate_detector_headers_errors(): + source = [ + ["id", "last_name", "first_name", "language"], + [1, "Alex", "John", "English"], + [2, "Peters", "John", "Afrikaans"], + [3, "Smith", "Paul", None], + ] + schema = Schema.from_descriptor( + { + "fields": [ + {"name": "id", "type": "number"}, + { + "name": "language", + "type": "string", + "constraints": {"required": True}, + }, + {"name": "country", "type": "string"}, + ] + } + ) + detector = Detector(schema_sync=True) + resource = TableResource(data=source, schema=schema, detector=detector) + report = resource.validate() + assert report.flatten(["rowNumber", "fieldNumber", "type", "cells"]) == [ + [4, 4, "constraint-error", ["3", "Smith", "Paul", ""]], + ] + + +def test_resource_validate_detector_patch_schema(): + detector = Detector(schema_patch={"missingValues": ["-"]}) + resource = TableResource(path="data/table.csv", detector=detector) + report = resource.validate() + assert report.valid + assert resource.schema.to_descriptor() == { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "name", "type": "string"}, + ], + "missingValues": ["-"], + } + + +def test_resource_validate_detector_patch_schema_fields(): + detector = Detector( + schema_patch={"fields": {"id": {"type": "string"}}, "missingValues": ["-"]} + ) + resource = TableResource(path="data/table.csv", detector=detector) + report = resource.validate() + assert report.valid + assert resource.schema.to_descriptor() == { + "fields": [ + {"name": "id", "type": "string"}, + {"name": "name", "type": "string"}, + ], + "missingValues": ["-"], + } + + +def test_resource_validate_detector_infer_type_string(): + 
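
The next two tests pin every column to a single type through the detector; as a standalone sketch of that option (same data/table.csv fixture used throughout):

    from frictionless import Detector
    from frictionless.resources import TableResource

    # field_type short-circuits per-column type inference
    detector = Detector(field_type="string")
    resource = TableResource(path="data/table.csv", detector=detector)
    resource.infer()
    print([field.type for field in resource.schema.fields])  # ['string', 'string']
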
detector = Detector(field_type="string") + resource = TableResource(path="data/table.csv", detector=detector) + report = resource.validate() + assert report.valid + assert resource.schema.to_descriptor() == { + "fields": [ + {"name": "id", "type": "string"}, + {"name": "name", "type": "string"}, + ], + } + + +def test_resource_validate_detector_infer_type_any(): + detector = Detector(field_type="any") + resource = TableResource(path="data/table.csv", detector=detector) + report = resource.validate() + assert report.valid + assert resource.schema.to_descriptor() == { + "fields": [{"name": "id", "type": "any"}, {"name": "name", "type": "any"}], + } + + +def test_resource_validate_detector_infer_names(): + dialect = Dialect(header=False) + detector = Detector(field_names=["id", "name"]) + resource = TableResource( + path="data/without-headers.csv", dialect=dialect, detector=detector + ) + report = resource.validate() + assert report.valid + assert resource.schema.fields[0].name == "id" + assert resource.schema.fields[1].name == "name" + assert resource.stats.rows == 3 + assert resource.labels == [] + assert resource.header == ["id", "name"] + + +# Encoding + + +def test_resource_validate_encoding(): + resource = TableResource(path="data/table.csv", encoding="utf-8") + report = resource.validate() + assert report.valid + + +@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows") +def test_resource_validate_encoding_invalid(): + resource = TableResource(path="data/latin1.csv", encoding="utf-8") + report = resource.validate() + assert not report.valid + assert report.flatten(["type", "note"]) == [ + [ + "encoding-error", + "'utf-8' codec can't decode byte 0xa9 in position 20: invalid start byte", + ], + ] + + +# File + + +def test_resource_validate_format_non_tabular(): + resource = Resource("data/table.bad") + report = resource.validate() + assert report.valid + + +def test_resource_validate_invalid_resource_standards_v2_strict(): + report = Resource.validate_descriptor({"path": "data/table.csv"}) + assert report.flatten(["type", "note"]) == [ + ["resource-error", "'name' is a required property"], + ] + + +# Format + + +def test_resource_validate_format(): + resource = TableResource(path="data/table.csv", format="csv") + report = resource.validate() + assert report.valid + + +# Stats + + +def test_resource_validate_stats_hash(): + hash = "sha256:a1fd6c5ff3494f697874deeb07f69f8667e903dd94a7bc062dd57550cea26da8" + resource = TableResource(path="data/table.csv", hash=hash) + report = resource.validate() + assert report.task.valid + + +def test_resource_validate_stats_hash_invalid(): + hash = "6c2c61dd9b0e9c6876139a449ed87933" + resource = TableResource(path="data/table.csv", hash="bad") + report = resource.validate() + assert report.flatten(["type", "note"]) == [ + [ + "hash-count", + 'expected is "bad" and actual is "%s"' % hash, + ], + ] + + +def test_resource_validate_stats_bytes(): + resource = TableResource(path="data/table.csv", bytes=30) + report = resource.validate() + assert report.task.valid + + +def test_resource_validate_stats_bytes_invalid(): + resource = TableResource(path="data/table.csv", bytes=40) + report = resource.validate() + assert report.task.error.to_descriptor().get("rowNumber") is None + assert report.task.error.to_descriptor().get("fieldNumber") is None + assert report.flatten(["type", "note"]) == [ + ["byte-count", 'expected is "40" and actual is "30"'], + ] + + +def test_resource_validate_stats_rows(): + resource = TableResource(path="data/table.csv", 
rows=2) + report = resource.validate() + assert report.task.valid + + +def test_resource_validate_stats_rows_invalid(): + resource = TableResource(path="data/table.csv", rows=3) + report = resource.validate() + assert report.task.error.to_descriptor().get("rowNumber") is None + assert report.task.error.to_descriptor().get("fieldNumber") is None + assert report.flatten(["type", "note"]) == [ + ["row-count", 'expected is "3" and actual is "2"'], + ] + + +def test_resource_validate_stats_not_supported_hash_algorithm(): + resource = TableResource.from_descriptor( + { + "name": "name", + "path": "data/table.csv", + "hash": "sha1:db6ea2f8ff72a9e13e1d70c28ed1c6b42af3bb0e", + } + ) + report = resource.validate() + assert report.task.warnings == ["hash is ignored; supported algorithms: md5/sha256"] + + +# Scheme + + +def test_resource_validate_scheme(): + resource = TableResource(path="data/table.csv", scheme="file") + report = resource.validate() + assert report.valid + + +def test_resource_validate_scheme_invalid(): + resource = TableResource(path="bad://data/table.csv") + report = resource.validate() + assert report.flatten(["type", "note"]) == [ + ["scheme-error", 'scheme "bad" is not supported'], + ] diff --git a/frictionless/validator/__spec__/resource/test_dialect.py b/frictionless/resource/__spec__/test_validate_dialect.py similarity index 99% rename from frictionless/validator/__spec__/resource/test_dialect.py rename to frictionless/resource/__spec__/test_validate_dialect.py index b587ce8841..984999b0c9 100644 --- a/frictionless/validator/__spec__/resource/test_dialect.py +++ b/frictionless/resource/__spec__/test_validate_dialect.py @@ -1,7 +1,7 @@ from frictionless import Dialect, Resource, formats from frictionless.resources import TableResource -# General +# Dialect def test_resource_validate_dialect_delimiter(): diff --git a/frictionless/validator/__spec__/resource/test_schema.py b/frictionless/resource/__spec__/test_validate_schema.py similarity index 100% rename from frictionless/validator/__spec__/resource/test_schema.py rename to frictionless/resource/__spec__/test_validate_schema.py diff --git a/frictionless/validator/__spec__/__init__.py b/frictionless/validator/__spec__/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/frictionless/validator/__spec__/package/__init__.py b/frictionless/validator/__spec__/package/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/frictionless/validator/__spec__/package/test_checklist.py b/frictionless/validator/__spec__/package/test_checklist.py deleted file mode 100644 index db25ec888d..0000000000 --- a/frictionless/validator/__spec__/package/test_checklist.py +++ /dev/null @@ -1,23 +0,0 @@ -import json - -from frictionless import Checklist, Package - - -def test_package_validate_with_skip_errors(): - ## Test runs on data with two blank-row errors, one primary-key error, see - # first test case - test_cases = [ - {"ignore": [], "expect_errors": ["blank-row", "primary-key", "blank-row"]}, - {"ignore": ["primary-key"], "expect_errors": ["blank-row", "blank-row"]}, - {"ignore": ["blank-row"], "expect_errors": ["primary-key"]}, - {"ignore": ["blank-row", "primary-key"], "expect_errors": []}, - ] - - for tc in test_cases: - with open("data/invalid/datapackage.json") as file: - package = Package(json.load(file), basepath="data/invalid") - checklist = Checklist(skip_errors=tc["ignore"]) - - report = package.validate(checklist) - - assert report.flatten(["type"]) == [[t] for t in 
tc["expect_errors"]] diff --git a/frictionless/validator/__spec__/package/test_parallel.py b/frictionless/validator/__spec__/package/test_parallel.py deleted file mode 100644 index 0ee0e14b1e..0000000000 --- a/frictionless/validator/__spec__/package/test_parallel.py +++ /dev/null @@ -1,41 +0,0 @@ -import json - -import pytest - -from frictionless import Package - -# General - -# Note: to test parallel validation, do not use foreign keys to prevent an -# automatic fallback on single-core execution - - -@pytest.mark.ci -def test_package_validate_parallel_from_dict(): - with open("data/datapackage.json") as file: - package = Package(json.load(file), basepath="data") - report = package.validate(parallel=True) - assert report.valid - - -@pytest.mark.ci -def test_package_validate_parallel_from_dict_invalid(): - with open("data/invalid/datapackage_no_foreign_key.json") as file: - package = Package(json.load(file), basepath="data/invalid") - report = package.validate(parallel=True) - assert report.flatten(["taskNumber", "rowNumber", "fieldNumber", "type"]) == [ - [1, 3, None, "blank-row"], - [1, 3, None, "primary-key"], - [2, 4, None, "blank-row"], - ] - - -@pytest.mark.ci -def test_package_validate_with_parallel(): - package = Package("data/invalid/datapackage_no_foreign_key.json") - report = package.validate(parallel=True) - assert report.flatten(["taskNumber", "rowNumber", "fieldNumber", "type"]) == [ - [1, 3, None, "blank-row"], - [1, 3, None, "primary-key"], - [2, 4, None, "blank-row"], - ] diff --git a/frictionless/validator/__spec__/package/test_schema.py b/frictionless/validator/__spec__/package/test_schema.py deleted file mode 100644 index 05a5f5fb63..0000000000 --- a/frictionless/validator/__spec__/package/test_schema.py +++ /dev/null @@ -1,179 +0,0 @@ -from copy import deepcopy - -from frictionless import Detector, Package, Resource, Schema, fields - -# General - - -DESCRIPTOR_FK = { - "resources": [ - { - "name": "cities", - "data": [ - ["id", "name", "next_id"], - [1, "london", 2], - [2, "paris", 3], - [3, "rome", 4], - [4, "rio", None], - ], - "schema": { - "fields": [ - {"name": "id", "type": "integer"}, - {"name": "name", "type": "string"}, - {"name": "next_id", "type": "integer"}, - ], - "foreignKeys": [ - {"fields": "next_id", "reference": {"resource": "", "fields": "id"}}, - { - "fields": "id", - "reference": {"resource": "people", "fields": "label"}, - }, - ], - }, - }, - { - "name": "people", - "data": [["label", "population"], [1, 8], [2, 2], [3, 3], [4, 6]], - }, - ], -} - -MULTI_FK_RESSOURCE = { - "name": "travel_time", - "data": [["from", "to", "hours"], [1, 2, 1.5], [2, 3, 8], [3, 4, 18]], - "schema": { - "fields": [ - {"name": "from", "type": "integer"}, - {"name": "to", "type": "integer"}, - {"name": "hours", "type": "number"}, - ], - "foreignKeys": [ - { - "fields": ["from", "to"], - "reference": {"resource": "cities", "fields": ["id", "next_id"]}, - } - ], - }, -} - - -def test_package_validate_schema_foreign_key_error(): - descriptor = deepcopy(DESCRIPTOR_FK) - package = Package(descriptor) - report = package.validate() - assert report.valid - - -def test_package_validate_schema_foreign_key_not_defined(): - descriptor = deepcopy(DESCRIPTOR_FK) - del descriptor["resources"][0]["schema"]["foreignKeys"] - package = Package(descriptor) - report = package.validate() - assert report.valid - - -def test_package_validate_schema_foreign_key_self_referenced_resource_violation(): - descriptor = deepcopy(DESCRIPTOR_FK) - del descriptor["resources"][0]["data"][4] - package = 
Package(descriptor) - report = package.validate() - assert report.flatten(["rowNumber", "fieldNumber", "type", "cells"]) == [ - [4, None, "foreign-key", ["3", "rome", "4"]], - ] - - -def test_package_validate_schema_foreign_key_internal_resource_violation(): - descriptor = deepcopy(DESCRIPTOR_FK) - del descriptor["resources"][1]["data"][4] - package = Package(descriptor) - report = package.validate() - assert report.flatten(["rowNumber", "fieldNumber", "type", "cells"]) == [ - [5, None, "foreign-key", ["4", "rio", ""]], - ] - - -def test_package_validate_schema_foreign_key_internal_resource_violation_non_existent(): - descriptor = deepcopy(DESCRIPTOR_FK) - descriptor["resources"][1]["data"] = [["label", "population"], [10, 10]] - package = Package(descriptor) - report = package.validate() - assert report.flatten(["rowNumber", "fieldNumber", "type", "cells"]) == [ - [2, None, "foreign-key", ["1", "london", "2"]], - [3, None, "foreign-key", ["2", "paris", "3"]], - [4, None, "foreign-key", ["3", "rome", "4"]], - [5, None, "foreign-key", ["4", "rio", ""]], - ] - - -def test_package_validate_schema_multiple_foreign_key(): - descriptor = deepcopy(DESCRIPTOR_FK) - descriptor["resources"].append(MULTI_FK_RESSOURCE) - package = Package(descriptor) - report = package.validate() - assert report.valid - - -def test_package_validate_schema_multiple_foreign_key_resource_violation_non_existent(): - descriptor = deepcopy(DESCRIPTOR_FK) - # remove London - del descriptor["resources"][0]["data"][1] - descriptor["resources"].append(MULTI_FK_RESSOURCE) - package = Package(descriptor) - report = package.validate() - assert report.flatten(["rowNumber", "fieldNumber", "type", "cells", "note"]) == [ - [ - 2, - None, - "foreign-key", - ["1", "2", "1.5"], - 'for "from, to": values "1, 2" not found in the lookup table "cities" as "id, next_id"', - ], - ] - - -def test_package_validate_schema_multiple_foreign_key_violations(): - descriptor = deepcopy(DESCRIPTOR_FK) - # Add some wrong fks - descriptor["resources"][0]["data"][3][0] = 5 - descriptor["resources"][0]["data"][4][0] = 6 - descriptor["resources"].append(MULTI_FK_RESSOURCE) - package = Package(descriptor) - report = package.validate() - assert report.flatten( - [ - "rowNumber", - "fieldNames", - "fieldCells", - "referenceName", - "referenceFieldNames", - ] - ) == [ - [3, ["next_id"], ["3"], "", ["id"]], - [4, ["next_id"], ["4"], "", ["id"]], - [4, ["id"], ["5"], "people", ["label"]], - [5, ["id"], ["6"], "people", ["label"]], - [4, ["from", "to"], ["3", "4"], "cities", ["id", "next_id"]], - ] - - -# Bugs - - -def test_package_validate_using_detector_schema_sync_issue_847(): - package = Package( - resources=[ - Resource( - data=[["f1"], ["v1"], ["v2"], ["v3"]], - schema=Schema( - fields=[ - fields.AnyField(name="f1"), - fields.AnyField(name="f2"), - ] - ), - ), - ] - ) - for resource in package.resources: - resource.detector = Detector(schema_sync=True) - report = package.validate() - assert report.valid diff --git a/frictionless/validator/__spec__/package/test_stats.py b/frictionless/validator/__spec__/package/test_stats.py deleted file mode 100644 index 3c921ea3cd..0000000000 --- a/frictionless/validator/__spec__/package/test_stats.py +++ /dev/null @@ -1,79 +0,0 @@ -from copy import deepcopy - -import pytest - -from frictionless import Package, platform - -# General - - -DESCRIPTOR_SH = { - "resources": [ - { - "name": "resource1", - "path": "data/table.csv", - "hash": "sha256:a1fd6c5ff3494f697874deeb07f69f8667e903dd94a7bc062dd57550cea26da8", - "bytes": 30, 
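
For context on the hash/bytes pair in this descriptor (and in the moved stats tests above): the expected values are simply the checksum and size of the fixture, recomputable with the standard library (a sketch; assumes the repository's data/table.csv):

    import hashlib
    from pathlib import Path

    data = Path("data/table.csv").read_bytes()
    print("sha256:" + hashlib.sha256(data).hexdigest())  # the "hash" value
    print(len(data))                                      # the "bytes" value, 30
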
- } - ] -} - - -@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows") -def test_package_validate_stats(): - source = deepcopy(DESCRIPTOR_SH) - package = Package(source) - report = package.validate() - assert report.valid - - -def test_package_validate_stats_invalid(): - source = deepcopy(DESCRIPTOR_SH) - source["resources"][0]["hash"] += "a" - source["resources"][0]["bytes"] += 1 - package = Package(source) - report = package.validate() - assert report.flatten(["rowNumber", "fieldNumber", "type"]) == [ - [None, None, "hash-count"], - [None, None, "byte-count"], - ] - - -@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows") -def test_package_validate_stats_size(): - source = deepcopy(DESCRIPTOR_SH) - source["resources"][0].pop("hash") - package = Package(source) - report = package.validate() - assert report.valid - - -def test_package_validate_stats_size_invalid(): - source = deepcopy(DESCRIPTOR_SH) - source["resources"][0]["bytes"] += 1 - source["resources"][0].pop("hash") - package = Package(source) - report = package.validate() - assert report.flatten(["rowNumber", "fieldNumber", "type"]) == [ - [None, None, "byte-count"], - ] - - -@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows") -def test_package_validate_stats_hash(): - source = deepcopy(DESCRIPTOR_SH) - source["resources"][0].pop("bytes") - package = Package(source) - report = package.validate() - assert report.valid - - -def test_package_validate_check_file_package_stats_hash_invalid(): - source = deepcopy(DESCRIPTOR_SH) - source["resources"][0].pop("bytes") - source["resources"][0]["hash"] += "a" - package = Package(source) - report = package.validate() - assert report.flatten(["rowNumber", "fieldNumber", "type"]) == [ - [None, None, "hash-count"], - ] diff --git a/frictionless/validator/__spec__/resource/__init__.py b/frictionless/validator/__spec__/resource/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/frictionless/validator/__spec__/resource/test_checklist.py b/frictionless/validator/__spec__/resource/test_checklist.py deleted file mode 100644 index ed30463993..0000000000 --- a/frictionless/validator/__spec__/resource/test_checklist.py +++ /dev/null @@ -1,14 +0,0 @@ -from frictionless import Checklist -from frictionless.resources import TableResource - -# General - - -def test_resource_validate_bound_checklist(): - checklist = Checklist(pick_errors=["blank-label", "blank-row"]) - resource = TableResource(path="data/invalid.csv") - report = resource.validate(checklist) - assert report.flatten(["rowNumber", "fieldNumber", "type"]) == [ - [None, 3, "blank-label"], - [4, None, "blank-row"], - ] diff --git a/frictionless/validator/__spec__/resource/test_compression.py b/frictionless/validator/__spec__/resource/test_compression.py deleted file mode 100644 index 8fa7bacfa4..0000000000 --- a/frictionless/validator/__spec__/resource/test_compression.py +++ /dev/null @@ -1,23 +0,0 @@ -from frictionless.resources import TableResource - -# General - - -def test_resource_validate_compression(): - resource = TableResource(path="data/table.csv.zip") - report = resource.validate() - assert report.valid - - -def test_resource_validate_compression_explicit(): - resource = TableResource(path="data/table.csv.zip", compression="zip") - report = resource.validate() - assert report.valid - - -def test_resource_validate_compression_invalid(): - resource = TableResource(path="data/table.csv.zip", compression="bad") - report = resource.validate() - assert 
report.flatten(["type", "note"]) == [ - ["compression-error", 'compression "bad" is not supported'], - ] diff --git a/frictionless/validator/__spec__/resource/test_detector.py b/frictionless/validator/__spec__/resource/test_detector.py deleted file mode 100644 index c136cd9e62..0000000000 --- a/frictionless/validator/__spec__/resource/test_detector.py +++ /dev/null @@ -1,130 +0,0 @@ -from frictionless import Detector, Dialect, Schema -from frictionless.resources import TableResource - -# General - - -def test_resource_validate_detector_sync_schema(): - schema = Schema.from_descriptor( - { - "fields": [ - {"name": "id", "type": "integer"}, - {"name": "name", "type": "string"}, - ], - } - ) - detector = Detector(schema_sync=True) - resource = TableResource( - path="data/sync-schema.csv", schema=schema, detector=detector - ) - report = resource.validate() - assert report.valid - assert resource.schema.to_descriptor() == { - "fields": [ - {"name": "name", "type": "string"}, - {"name": "id", "type": "integer"}, - ], - } - - -def test_resource_validate_detector_sync_schema_invalid(): - source = [["LastName", "FirstName", "Address"], ["Test", "Tester", "23 Avenue"]] - schema = Schema.from_descriptor( - { - "fields": [ - {"name": "id", "type": "string"}, - {"name": "FirstName", "type": "string"}, - {"name": "LastName", "type": "string"}, - ] - } - ) - detector = Detector(schema_sync=True) - resource = TableResource(data=source, schema=schema, detector=detector) - report = resource.validate() - assert report.valid - - -def test_resource_validate_detector_headers_errors(): - source = [ - ["id", "last_name", "first_name", "language"], - [1, "Alex", "John", "English"], - [2, "Peters", "John", "Afrikaans"], - [3, "Smith", "Paul", None], - ] - schema = Schema.from_descriptor( - { - "fields": [ - {"name": "id", "type": "number"}, - {"name": "language", "type": "string", "constraints": {"required": True}}, - {"name": "country", "type": "string"}, - ] - } - ) - detector = Detector(schema_sync=True) - resource = TableResource(data=source, schema=schema, detector=detector) - report = resource.validate() - assert report.flatten(["rowNumber", "fieldNumber", "type", "cells"]) == [ - [4, 4, "constraint-error", ["3", "Smith", "Paul", ""]], - ] - - -def test_resource_validate_detector_patch_schema(): - detector = Detector(schema_patch={"missingValues": ["-"]}) - resource = TableResource(path="data/table.csv", detector=detector) - report = resource.validate() - assert report.valid - assert resource.schema.to_descriptor() == { - "fields": [ - {"name": "id", "type": "integer"}, - {"name": "name", "type": "string"}, - ], - "missingValues": ["-"], - } - - -def test_resource_validate_detector_patch_schema_fields(): - detector = Detector( - schema_patch={"fields": {"id": {"type": "string"}}, "missingValues": ["-"]} - ) - resource = TableResource(path="data/table.csv", detector=detector) - report = resource.validate() - assert report.valid - assert resource.schema.to_descriptor() == { - "fields": [{"name": "id", "type": "string"}, {"name": "name", "type": "string"}], - "missingValues": ["-"], - } - - -def test_resource_validate_detector_infer_type_string(): - detector = Detector(field_type="string") - resource = TableResource(path="data/table.csv", detector=detector) - report = resource.validate() - assert report.valid - assert resource.schema.to_descriptor() == { - "fields": [{"name": "id", "type": "string"}, {"name": "name", "type": "string"}], - } - - -def test_resource_validate_detector_infer_type_any(): - detector = 
Detector(field_type="any") - resource = TableResource(path="data/table.csv", detector=detector) - report = resource.validate() - assert report.valid - assert resource.schema.to_descriptor() == { - "fields": [{"name": "id", "type": "any"}, {"name": "name", "type": "any"}], - } - - -def test_resource_validate_detector_infer_names(): - dialect = Dialect(header=False) - detector = Detector(field_names=["id", "name"]) - resource = TableResource( - path="data/without-headers.csv", dialect=dialect, detector=detector - ) - report = resource.validate() - assert report.valid - assert resource.schema.fields[0].name == "id" - assert resource.schema.fields[1].name == "name" - assert resource.stats.rows == 3 - assert resource.labels == [] - assert resource.header == ["id", "name"] diff --git a/frictionless/validator/__spec__/resource/test_encoding.py b/frictionless/validator/__spec__/resource/test_encoding.py deleted file mode 100644 index 9780c23172..0000000000 --- a/frictionless/validator/__spec__/resource/test_encoding.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest - -from frictionless import platform -from frictionless.resources import TableResource - -# General - - -def test_resource_validate_encoding(): - resource = TableResource(path="data/table.csv", encoding="utf-8") - report = resource.validate() - assert report.valid - - -@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows") -def test_resource_validate_encoding_invalid(): - resource = TableResource(path="data/latin1.csv", encoding="utf-8") - report = resource.validate() - assert not report.valid - assert report.flatten(["type", "note"]) == [ - [ - "encoding-error", - "'utf-8' codec can't decode byte 0xa9 in position 20: invalid start byte", - ], - ] diff --git a/frictionless/validator/__spec__/resource/test_file.py b/frictionless/validator/__spec__/resource/test_file.py deleted file mode 100644 index eadad13769..0000000000 --- a/frictionless/validator/__spec__/resource/test_file.py +++ /dev/null @@ -1,16 +0,0 @@ -from frictionless import Resource - -# General - - -def test_resource_validate_format_non_tabular(): - resource = Resource("data/table.bad") - report = resource.validate() - assert report.valid - - -def test_resource_validate_invalid_resource_standards_v2_strict(): - report = Resource.validate_descriptor({"path": "data/table.csv"}) - assert report.flatten(["type", "note"]) == [ - ["resource-error", "'name' is a required property"], - ] diff --git a/frictionless/validator/__spec__/resource/test_format.py b/frictionless/validator/__spec__/resource/test_format.py deleted file mode 100644 index 002df585d1..0000000000 --- a/frictionless/validator/__spec__/resource/test_format.py +++ /dev/null @@ -1,9 +0,0 @@ -from frictionless.resources import TableResource - -# General - - -def test_resource_validate_format(): - resource = TableResource(path="data/table.csv", format="csv") - report = resource.validate() - assert report.valid diff --git a/frictionless/validator/__spec__/resource/test_scheme.py b/frictionless/validator/__spec__/resource/test_scheme.py deleted file mode 100644 index fdcca6c8e8..0000000000 --- a/frictionless/validator/__spec__/resource/test_scheme.py +++ /dev/null @@ -1,17 +0,0 @@ -from frictionless.resources import TableResource - -# General - - -def test_resource_validate_scheme(): - resource = TableResource(path="data/table.csv", scheme="file") - report = resource.validate() - assert report.valid - - -def test_resource_validate_scheme_invalid(): - resource = TableResource(path="bad://data/table.csv") 
- report = resource.validate() - assert report.flatten(["type", "note"]) == [ - ["scheme-error", 'scheme "bad" is not supported'], - ] diff --git a/frictionless/validator/__spec__/resource/test_stats.py b/frictionless/validator/__spec__/resource/test_stats.py deleted file mode 100644 index ab19b9239c..0000000000 --- a/frictionless/validator/__spec__/resource/test_stats.py +++ /dev/null @@ -1,66 +0,0 @@ -from frictionless.resources import TableResource - -# General - - -def test_resource_validate_stats_hash(): - hash = "sha256:a1fd6c5ff3494f697874deeb07f69f8667e903dd94a7bc062dd57550cea26da8" - resource = TableResource(path="data/table.csv", hash=hash) - report = resource.validate() - assert report.task.valid - - -def test_resource_validate_stats_hash_invalid(): - hash = "6c2c61dd9b0e9c6876139a449ed87933" - resource = TableResource(path="data/table.csv", hash="bad") - report = resource.validate() - assert report.flatten(["type", "note"]) == [ - [ - "hash-count", - 'expected is "bad" and actual is "%s"' % hash, - ], - ] - - -def test_resource_validate_stats_bytes(): - resource = TableResource(path="data/table.csv", bytes=30) - report = resource.validate() - assert report.task.valid - - -def test_resource_validate_stats_bytes_invalid(): - resource = TableResource(path="data/table.csv", bytes=40) - report = resource.validate() - assert report.task.error.to_descriptor().get("rowNumber") is None - assert report.task.error.to_descriptor().get("fieldNumber") is None - assert report.flatten(["type", "note"]) == [ - ["byte-count", 'expected is "40" and actual is "30"'], - ] - - -def test_resource_validate_stats_rows(): - resource = TableResource(path="data/table.csv", rows=2) - report = resource.validate() - assert report.task.valid - - -def test_resource_validate_stats_rows_invalid(): - resource = TableResource(path="data/table.csv", rows=3) - report = resource.validate() - assert report.task.error.to_descriptor().get("rowNumber") is None - assert report.task.error.to_descriptor().get("fieldNumber") is None - assert report.flatten(["type", "note"]) == [ - ["row-count", 'expected is "3" and actual is "2"'], - ] - - -def test_resource_validate_stats_not_supported_hash_algorithm(): - resource = TableResource.from_descriptor( - { - "name": "name", - "path": "data/table.csv", - "hash": "sha1:db6ea2f8ff72a9e13e1d70c28ed1c6b42af3bb0e", - } - ) - report = resource.validate() - assert report.task.warnings == ["hash is ignored; supported algorithms: md5/sha256"] From ee4ad2f95134f584d7eb2af3f267074d0d42dd13 Mon Sep 17 00:00:00 2001 From: Pierre Camilleri Date: Fri, 17 Jan 2025 10:37:40 +0100 Subject: [PATCH 07/23] first attempt --- frictionless/resources/table.py | 33 ++----------- frictionless/table/row.py | 85 ++++++++++++++++++++------------- 2 files changed, 56 insertions(+), 62 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 5891ace0b9..d47ab131a2 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -14,6 +14,7 @@ from ..indexer import Indexer from ..platform import platform from ..resource import Resource +from ..schema.fields_info import FieldsInfo from ..system import system from ..table import Header, Lookup, Row, Table from ..transformer import Transformer @@ -263,24 +264,7 @@ def __open_lookup(self): self.__lookup[source_name][source_key].add(cells) def __open_row_stream(self): - # TODO: we need to rework this field_info / row code - # During row streaming we create a field info structure - # This structure is 
optimized and detached version of schema.fields - # We create all data structures in-advance to share them between rows - - # Create field info - field_number = 0 - field_info: Dict[str, Any] = {"names": [], "objects": [], "mapping": {}} - for field in self.schema.fields: - field_number += 1 - field_info["names"].append(field.name) - field_info["objects"].append(field.to_copy()) - field_info["mapping"][field.name] = ( - field, - field_number, - field.create_cell_reader(), - field.create_cell_writer(), - ) + field_info = FieldsInfo(self.schema.fields) # Create state memory_unique: Dict[str, Any] = {} @@ -403,13 +387,13 @@ def row_stream(): self.__row_stream = row_stream() def remove_missing_required_label_from_field_info( - self, field: Field, field_info: Dict[str, Any] + self, field: Field, fields_info: FieldsInfo ): is_case_sensitive = self.dialect.header_case if self.label_is_missing( - field.name, field_info["names"], self.labels, is_case_sensitive + field.name, fields_info.ls(), self.labels, is_case_sensitive ): - self.remove_field_from_field_info(field.name, field_info) + fields_info.rm(field.name) @staticmethod def label_is_missing( @@ -430,13 +414,6 @@ def label_is_missing( return field_name not in table_labels and field_name in expected_field_names - @staticmethod - def remove_field_from_field_info(field_name: str, field_info: Dict[str, Any]): - field_index = field_info["names"].index(field_name) - del field_info["names"][field_index] - del field_info["objects"][field_index] - del field_info["mapping"][field_name] - def primary_key_cells(self, row: Row, case_sensitive: bool) -> Tuple[Any, ...]: """Create a tuple containg all cells from a given row associated to primary keys""" diff --git a/frictionless/table/row.py b/frictionless/table/row.py index b2947ba677..780d454e15 100644 --- a/frictionless/table/row.py +++ b/frictionless/table/row.py @@ -6,6 +6,7 @@ from .. 
import errors, helpers
 from ..platform import platform
+from ..schema.fields_info import FieldsInfo

 # NOTE:
 # Currently dict.update/setdefault/pop/popitem/clear is not disabled (can be confusing)
 # We can consider adding row.header property to provide more comprehensive API

@@ -36,11 +37,11 @@ def __init__(
         self,
         cells: List[Any],
         *,
-        field_info: Dict[str, Any],
+        field_info: FieldsInfo,
         row_number: int,
     ):
         self.__cells = cells
-        self.__field_info = field_info
+        self.__fields_info = field_info
         self.__row_number = row_number
         self.__processed: bool = False
         self.__blank_cells: Dict[str, Any] = {}
@@ -61,7 +62,7 @@ def __repr__(self):

     def __setitem__(self, key: str, value: Any):
         try:
-            _, field_number, _, _ = self.__field_info["mapping"][key]
+            field_number = self.__fields_info.get(key).field_number
         except KeyError:
             raise KeyError(f"Row does not have a field {key}")
         if len(self.__cells) < field_number:
@@ -73,38 +74,38 @@ def __missing__(self, key: str):
         return self.__process(key)

     def __iter__(self):
-        return iter(self.__field_info["names"])
+        return iter(self.__fields_info.ls())

     def __len__(self):
-        return len(self.__field_info["names"])
+        return len(self.__fields_info.ls())

     def __contains__(self, key: object):
-        return key in self.__field_info["mapping"]
+        return key in self.__fields_info.ls()

     def __reversed__(self):
-        return reversed(self.__field_info["names"])
+        return reversed(self.__fields_info.ls())

     def keys(self):
-        return iter(self.__field_info["names"])
+        return iter(self.__fields_info.ls())

     def values(self):  # type: ignore
-        for name in self.__field_info["names"]:
+        for name in self.__fields_info.ls():
             yield self[name]

     def items(self):  # type: ignore
-        for name in self.__field_info["names"]:
+        for name in self.__fields_info.ls():
             yield (name, self[name])

     def get(self, key: str, default: Optional[Any] = None):
-        if key not in self.__field_info["names"]:
+        if key not in self.__fields_info.ls():
             return default
         return self[key]

     @cached_property
     def cells(self):
         """
         Returns:
             Field[]: table schema fields
         """
         return self.__cells

     @cached_property
     def fields(self):
         """
         Returns:
             Field[]: table schema fields
         """
-        return self.__field_info["objects"]
+        return self.__fields_info.get_copies()

     @cached_property
     def field_names(self) -> List[str]:
         """
         Returns:
             str[]: field names
         """
-        return self.__field_info["names"]
+        return self.__fields_info.ls()

     @cached_property
     def field_numbers(self):
         """
         Returns:
             str[]: field numbers
         """
-        return list(range(1, len(self.__field_info["names"]) + 1))
+        return list(range(1, len(self.__fields_info.ls()) + 1))

     @cached_property
     def row_number(self) -> int:
@@ -201,14 +202,18 @@ def to_list(self, *, json: bool = False, types: Optional[List[str]] = None):

         # Prepare
         self.__process()
-        result = [self[name] for name in self.__field_info["names"]]
+        result = [self[name] for name in self.__fields_info.ls()]
         if types is None and json:
             types = platform.frictionless_formats.JsonParser.supported_types

         # Convert
         if types is not None:
-            for index, field_mapping in enumerate(self.__field_info["mapping"].values()):
-                field, _, _, cell_writer = field_mapping
+            field_names = self.__fields_info.ls()
+            for index, field_name in enumerate(field_names):
+                field_info = self.__fields_info.get(field_name)
+                field = field_info.field
+                cell_writer = field_info.cell_writer
+
                 # Here we can optimize performance if we use a types mapping
                 if field.type in types:
                     continue
@@ -223,7 +228,11 @@ def to_list(self, *, json: bool = False, types:
Optional[List[str]] = None): return result def to_dict( - self, *, csv: bool = False, json: bool = False, types: Optional[List[str]] = None + self, + *, + csv: bool = False, + json: bool = False, + types: Optional[List[str]] = None, ) -> Dict[str, Any]: """ Parameters: @@ -235,7 +244,7 @@ def to_dict( # Prepare self.__process() - result = {name: self[name] for name in self.__field_info["names"]} + result = {name: self[name] for name in self.__fields_info.ls()} if types is None and json: types = platform.frictionless_formats.JsonParser.supported_types if types is None and csv: @@ -243,8 +252,12 @@ def to_dict( # Convert if types is not None: - for field_mapping in self.__field_info["mapping"].values(): - field, _, _, cell_writer = field_mapping + field_names = self.__fields_info.ls() + for field_name in field_names: + field_info = self.__fields_info.get(field_name) + field = field_info.field + cell_writer = field_info.cell_writer + # Here we can optimize performance if we use a types mapping if field.type not in types: cell = result[field.name] @@ -268,26 +281,30 @@ def __process(self, key: Optional[str] = None): # Prepare context cells = self.__cells to_str = lambda v: str(v) if v is not None else "" # type: ignore - fields = self.__field_info["objects"] - field_mapping = self.__field_info["mapping"] - iterator = zip_longest(field_mapping.values(), cells) + fields = self.__fields_info.get_copies() + names = self.__fields_info.ls() + field_infos = [self.__fields_info.get(name) for name in names] + iterator = zip_longest(field_infos, cells) is_empty = not bool(super().__len__()) + if key: try: - field, field_number, cell_reader, cell_writer = self.__field_info[ - "mapping" - ][key] - except KeyError: + field_info = self.__fields_info.get(key) + field_number = field_info.field_number + except ValueError: raise KeyError(f"Row does not have a field {key}") cell = cells[field_number - 1] if len(cells) >= field_number else None - iterator = zip([(field, field_number, cell_reader, cell_writer)], [cell]) + iterator = zip([field_info], [cell]) # Iterate cells - for field_mapping, source in iterator: + for field_info, source in iterator: # Prepare context - if field_mapping is None: + if field_info is None: break - field, field_number, cell_reader, _ = field_mapping + field = field_info.field + field_number = field_info.field_number + cell_reader = field_info.cell_reader + if not is_empty and super().__contains__(field.name): continue From 39aa0f2afdf88c210d132e86d8b2a74ce4c6794a Mon Sep 17 00:00:00 2001 From: Pierre Camilleri Date: Fri, 24 Jan 2025 15:14:15 +0100 Subject: [PATCH 08/23] squash! 
first attempt --- frictionless/schema/fields_info.py | 47 ++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 frictionless/schema/fields_info.py diff --git a/frictionless/schema/fields_info.py b/frictionless/schema/fields_info.py new file mode 100644 index 0000000000..081e435600 --- /dev/null +++ b/frictionless/schema/fields_info.py @@ -0,0 +1,47 @@ +from typing import List + +from frictionless.schema.field import Field + + +class FieldInfo: + """Private class to store additional data to a field""" + + def __init__(self, field: Field, field_number: int): + """field_number: 1-indexed rank of the field""" + self.field = field + self.field_number = field_number + self.cell_reader = field.create_cell_reader() + self.cell_writer = field.create_cell_writer() + + +class FieldsInfo: + def __init__(self, fields: List[Field]): + self._fields: List[FieldInfo] = [ + FieldInfo(field, i + 1) for i, field in enumerate(fields) + ] + + def ls(self) -> List[str]: + """List all field names""" + return [fi.field.name for fi in self._fields] + + def get(self, field_name: str) -> FieldInfo: + """Get a Field by its name + + Raises: + ValueError: Field with name fieldname does not exist + """ + try: + return next(fi for fi in self._fields if fi.field.name == field_name) + except StopIteration: + raise ValueError(f"'{field_name}' is not in fields data") + + def get_copies(self) -> List[Field]: + """Return field copies""" + return [fi.field.to_copy() for fi in self._fields] + + def rm(self, field_name: str): + try: + i = self.ls().index(field_name) + del self._fields[i] + except ValueError: + raise ValueError(f"'{field_name}' is not in fields data") From 70aadac6d2bb23be42e3e148ee6e4e2bfce06eb7 Mon Sep 17 00:00:00 2001 From: Pierre Camilleri Date: Fri, 24 Jan 2025 15:25:24 +0100 Subject: [PATCH 09/23] =?UTF-8?q?=F0=9F=94=B5=20Mv=20to=20resources/table?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frictionless/resources/table.py | 51 +++++++++++++++++++++++++++++- frictionless/schema/fields_info.py | 44 -------------------------- frictionless/table/row.py | 6 ++-- 3 files changed, 54 insertions(+), 47 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index d47ab131a2..9ca502f818 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -14,7 +14,6 @@ from ..indexer import Indexer from ..platform import platform from ..resource import Resource -from ..schema.fields_info import FieldsInfo from ..system import system from ..table import Header, Lookup, Row, Table from ..transformer import Transformer @@ -677,3 +676,53 @@ def write( self, target: Optional[Union[Resource, Any]] = None, **options: Any ) -> TableResource: return self.write_table(target, **options) + + +class _FieldInfo: + """Private class to store additional data alongside a field.""" + + def __init__(self, field: Field, field_number: int): + """field_number: 1-indexed rank of the field""" + self.field = field + self.field_number = field_number + self.cell_reader = field.create_cell_reader() + self.cell_writer = field.create_cell_writer() + + +class FieldsInfo: + """Helper class to store additional data to a collection of fields + + This class is not Public API, and should be used only in non-public + interfaces. 
+ """ + + def __init__(self, fields: List[Field]): + self._fields: List[_FieldInfo] = [ + _FieldInfo(field, i + 1) for i, field in enumerate(fields) + ] + + def ls(self) -> List[str]: + """List all field names""" + return [fi.field.name for fi in self._fields] + + def get(self, field_name: str) -> _FieldInfo: + """Get a Field by its name + + Raises: + ValueError: Field with name fieldname does not exist + """ + try: + return next(fi for fi in self._fields if fi.field.name == field_name) + except StopIteration: + raise ValueError(f"'{field_name}' is not in fields data") + + def get_copies(self) -> List[Field]: + """Return field copies""" + return [fi.field.to_copy() for fi in self._fields] + + def rm(self, field_name: str): + try: + i = self.ls().index(field_name) + del self._fields[i] + except ValueError: + raise ValueError(f"'{field_name}' is not in fields data") diff --git a/frictionless/schema/fields_info.py b/frictionless/schema/fields_info.py index 081e435600..0dc1c83ba7 100644 --- a/frictionless/schema/fields_info.py +++ b/frictionless/schema/fields_info.py @@ -1,47 +1,3 @@ from typing import List from frictionless.schema.field import Field - - -class FieldInfo: - """Private class to store additional data to a field""" - - def __init__(self, field: Field, field_number: int): - """field_number: 1-indexed rank of the field""" - self.field = field - self.field_number = field_number - self.cell_reader = field.create_cell_reader() - self.cell_writer = field.create_cell_writer() - - -class FieldsInfo: - def __init__(self, fields: List[Field]): - self._fields: List[FieldInfo] = [ - FieldInfo(field, i + 1) for i, field in enumerate(fields) - ] - - def ls(self) -> List[str]: - """List all field names""" - return [fi.field.name for fi in self._fields] - - def get(self, field_name: str) -> FieldInfo: - """Get a Field by its name - - Raises: - ValueError: Field with name fieldname does not exist - """ - try: - return next(fi for fi in self._fields if fi.field.name == field_name) - except StopIteration: - raise ValueError(f"'{field_name}' is not in fields data") - - def get_copies(self) -> List[Field]: - """Return field copies""" - return [fi.field.to_copy() for fi in self._fields] - - def rm(self, field_name: str): - try: - i = self.ls().index(field_name) - del self._fields[i] - except ValueError: - raise ValueError(f"'{field_name}' is not in fields data") diff --git a/frictionless/table/row.py b/frictionless/table/row.py index 780d454e15..74e7883c2f 100644 --- a/frictionless/table/row.py +++ b/frictionless/table/row.py @@ -2,16 +2,18 @@ from functools import cached_property from itertools import zip_longest -from typing import Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional from .. 
import errors, helpers
 from ..platform import platform
-from ..schema.fields_info import FieldsInfo

 # NOTE:
 # Currently dict.update/setdefault/pop/popitem/clear is not disabled (can be confusing)
 # We can consider adding row.header property to provide more comprehensive API

+if TYPE_CHECKING:
+    from ..resources.table import FieldsInfo
+

 # TODO: add types
 class Row(Dict[str, Any]):

From add8dacf7c790cd1dd0b924c84f6012131e8f9b2 Mon Sep 17 00:00:00 2001
From: Pierre Camilleri
Date: Mon, 27 Jan 2025 15:53:16 +0100
Subject: [PATCH 10/23] =?UTF-8?q?=F0=9F=94=B5=20rename?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 frictionless/detector/detector.py | 6 +++---
 frictionless/resources/table.py   | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py
index fc3e47e0a1..84acdccf88 100644
--- a/frictionless/detector/detector.py
+++ b/frictionless/detector/detector.py
@@ -415,8 +415,8 @@ def detect_schema(
                 note = '"schema_sync" requires unique labels in the header'
                 raise FrictionlessException(note)

-            mapped_fields = self.mapped_schema_fields_names(
-                schema.fields,  # type: ignore
+            mapped_fields = self.map_schema_fields_by_name(
+                schema.fields,
                 case_sensitive,
             )

@@ -445,7 +445,7 @@ def detect_schema(
         return schema

     @staticmethod
-    def mapped_schema_fields_names(
+    def map_schema_fields_by_name(
         fields: List[Field], case_sensitive: bool
     ) -> Dict[str, Field]:
         """Create a dictionary to map field names with schema fields"""
diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py
index 9ca502f818..ca2d69e231 100644
--- a/frictionless/resources/table.py
+++ b/frictionless/resources/table.py
@@ -382,7 +382,6 @@ def row_stream():
             for field in self.schema.fields:
                 self.remove_missing_required_label_from_field_info(field, field_info)

-        # Create row stream
         self.__row_stream = row_stream()

From e26393b4349137ccf12515ffeb8e3676c77cd74b Mon Sep 17 00:00:00 2001
From: Pierre Camilleri
Date: Mon, 27 Jan 2025 17:31:21 +0100
Subject: [PATCH 11/23] Schema sync functionality inside FieldsInfo

Test passes, surprisingly.
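
As an illustration (mirroring the moved spec
`test_resource_validate_detector_sync_schema`), the case this makes pass:

    schema = Schema.from_descriptor(
        {
            "fields": [
                {"name": "id", "type": "integer"},
                {"name": "name", "type": "string"},
            ],
        }
    )
    detector = Detector(schema_sync=True)
    resource = TableResource(
        path="data/sync-schema.csv", schema=schema, detector=detector
    )
    assert resource.validate().valid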
No special effort has been made to support the `header_case` option, or
"required" columns, with `schema_sync`
---
 frictionless/detector/detector.py | 45 +++++++++++-----------
 frictionless/resources/table.py   | 62 ++++++++++++++++++++++---------
 frictionless/table/row.py         |  8 ++--
 3 files changed, 72 insertions(+), 43 deletions(-)

diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py
index 84acdccf88..fb7b421d16 100644
--- a/frictionless/detector/detector.py
+++ b/frictionless/detector/detector.py
@@ -404,31 +404,30 @@ def detect_schema(
             schema.fields = fields  # type: ignore

         # Sync schema
-        if self.schema_sync:
-            if labels:
-                case_sensitive = options["header_case"]
+        if self.schema_sync and labels:
+            case_sensitive = options["header_case"]

-                if not case_sensitive:
-                    labels = [label.lower() for label in labels]
+            if not case_sensitive:
+                labels = [label.lower() for label in labels]

-                if len(labels) != len(set(labels)):
-                    note = '"schema_sync" requires unique labels in the header'
-                    raise FrictionlessException(note)
+            if len(labels) != len(set(labels)):
+                note = '"schema_sync" requires unique labels in the header'
+                raise FrictionlessException(note)

-                mapped_fields = self.map_schema_fields_by_name(
-                    schema.fields,
-                    case_sensitive,
-                )
+            mapped_fields = self.map_schema_fields_by_name(
+                schema.fields,
+                case_sensitive,
+            )

-                self.rearrange_schema_fields_given_labels(
-                    mapped_fields,
-                    schema,
-                    labels,
-                )
+            self.rearrange_schema_fields_given_labels(
+                mapped_fields,
+                schema,
+                labels,
+            )

-                self.add_missing_required_labels_to_schema_fields(
-                    mapped_fields, schema, labels, case_sensitive
-                )
+            self.add_missing_required_labels_to_schema_fields(
+                mapped_fields, schema, labels, case_sensitive
+            )

         # Patch schema
         if self.schema_patch:
@@ -460,8 +459,10 @@ def rearrange_schema_fields_given_labels(
         schema: Schema,
         labels: List[str],
     ):
-        """Rearrange fields according to the order of labels. All fields
-        missing from labels are dropped"""
+        """Rearrange fields according to the order of labels.
+        All fields missing from labels are dropped.
+        Any extra field is filled in with a default `"type": "any"` field.
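+
+        For example (illustrative): with schema fields `a, b` and labels
+        `["b", "c"]`, the resulting fields are `b` followed by an
+        on-the-fly `c` of type `any`; `a` is dropped.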
+ """ schema.clear_fields() for name in labels: diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index ca2d69e231..a3749fdd97 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -263,7 +263,9 @@ def __open_lookup(self): self.__lookup[source_name][source_key].add(cells) def __open_row_stream(self): - field_info = FieldsInfo(self.schema.fields) + fields_info = FieldsInfo( + self.schema.fields, self.labels, self.detector.schema_sync + ) # Create state memory_unique: Dict[str, Any] = {} @@ -296,7 +298,7 @@ def row_stream(): row = Row( cells, - field_info=field_info, + fields_info=fields_info, row_number=row_number, ) @@ -378,9 +380,9 @@ def row_stream(): if self.detector.schema_sync: # Missing required labels are not included in the - # field_info parameter used for row creation + # fields_info parameter used for row creation for field in self.schema.fields: - self.remove_missing_required_label_from_field_info(field, field_info) + self.remove_missing_required_label_from_field_info(field, fields_info) self.__row_stream = row_stream() @@ -415,7 +417,9 @@ def label_is_missing( def primary_key_cells(self, row: Row, case_sensitive: bool) -> Tuple[Any, ...]: """Create a tuple containg all cells from a given row associated to primary keys""" - return tuple(row[label] for label in self.primary_key_labels(row, case_sensitive)) + return tuple( + row[label] for label in self.primary_key_labels(row, case_sensitive) + ) def primary_key_labels( self, @@ -689,39 +693,63 @@ def __init__(self, field: Field, field_number: int): class FieldsInfo: - """Helper class to store additional data to a collection of fields + """Helper class for linking columns to schema fields. + + It abstracts away the different ways of making this link. In particular, the + reference may be the schema (`detector.schema_sync = False`), or the labels + (`detector.schema_sync = True`). This class is not Public API, and should be used only in non-public interfaces. 
""" - def __init__(self, fields: List[Field]): - self._fields: List[_FieldInfo] = [ - _FieldInfo(field, i + 1) for i, field in enumerate(fields) - ] + def __init__( + self, fields: List[Field], labels: Optional[List[str]], schema_sync: bool + ): + if schema_sync and labels: + self._expected_fields: List[_FieldInfo] = [] + if len(labels) != len(set(labels)): + note = '"schema_sync" requires unique labels in the header' + raise FrictionlessException(note) + + for label_index, label in enumerate(labels): + try: + field = next(f for f in fields if f.name == label) + except StopIteration: + field = Field.from_descriptor({"name": label, "type": "any"}) + self._expected_fields.append(_FieldInfo(field, label_index + 1)) + else: + self._expected_fields = [ + _FieldInfo(field, i + 1) for i, field in enumerate(fields) + ] def ls(self) -> List[str]: - """List all field names""" - return [fi.field.name for fi in self._fields] + """List all column names""" + return [fi.field.name for fi in self._expected_fields] def get(self, field_name: str) -> _FieldInfo: """Get a Field by its name + In case no field with field_name exists, the behavior depends on + the `detector.schema_sync` option: + Raises: - ValueError: Field with name fieldname does not exist + ValueError """ try: - return next(fi for fi in self._fields if fi.field.name == field_name) + return next( + fi for fi in self._expected_fields if fi.field.name == field_name + ) except StopIteration: - raise ValueError(f"'{field_name}' is not in fields data") + raise ValueError(f"{field_name} is missing from expected fields") def get_copies(self) -> List[Field]: """Return field copies""" - return [fi.field.to_copy() for fi in self._fields] + return [fi.field.to_copy() for fi in self._expected_fields] def rm(self, field_name: str): try: i = self.ls().index(field_name) - del self._fields[i] + del self._expected_fields[i] except ValueError: raise ValueError(f"'{field_name}' is not in fields data") diff --git a/frictionless/table/row.py b/frictionless/table/row.py index 74e7883c2f..b817b51759 100644 --- a/frictionless/table/row.py +++ b/frictionless/table/row.py @@ -39,11 +39,11 @@ def __init__( self, cells: List[Any], *, - field_info: FieldsInfo, + fields_info: FieldsInfo, row_number: int, ): self.__cells = cells - self.__fields_info = field_info + self.__fields_info = fields_info self.__row_number = row_number self.__processed: bool = False self.__blank_cells: Dict[str, Any] = {} @@ -65,7 +65,7 @@ def __repr__(self): def __setitem__(self, key: str, value: Any): try: field_number = self.__fields_info.get(key).field_number - except KeyError: + except ValueError: raise KeyError(f"Row does not have a field {key}") if len(self.__cells) < field_number: self.__cells.extend([None] * (field_number - len(self.__cells))) @@ -87,7 +87,7 @@ def __contains__(self, key: object): def __reversed__(self): return reversed(self.__fields_info.ls()) - def keys(self): + def keys(self): # type: ignore return iter(self.__fields_info.ls()) def values(self): # type: ignore From 8b72ae802f10799700809f528398818e2800df56 Mon Sep 17 00:00:00 2001 From: Pierre Camilleri Date: Wed, 29 Jan 2025 15:13:33 +0100 Subject: [PATCH 12/23] =?UTF-8?q?=F0=9F=94=B5=20remove=20empty=20/=20unuse?= =?UTF-8?q?d=20file?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frictionless/schema/fields_info.py | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 frictionless/schema/fields_info.py diff --git a/frictionless/schema/fields_info.py 
b/frictionless/schema/fields_info.py
deleted file mode 100644
index 0dc1c83ba7..0000000000
--- a/frictionless/schema/fields_info.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from typing import List
-
-from frictionless.schema.field import Field

From c7479c66864a52a822e3e7e4bc8a2f8fd315b86f Mon Sep 17 00:00:00 2001
From: Pierre Camilleri
Date: Wed, 29 Jan 2025 18:10:35 +0100
Subject: [PATCH 13/23] =?UTF-8?q?=F0=9F=9F=A2=20Test=20passes?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TODO: still some tidying up:

- Remove FieldsInfo, use header instead
- A less error-prone way for `_normalize`
---
 frictionless/detector/detector.py             |  83 ----------
 .../resource/__spec__/test_validate.py        |   6 -
 .../resource/__spec__/test_validate_schema.py |   2 +-
 frictionless/resources/table.py               |  26 ++-
 frictionless/table/header.py                  | 149 ++++++++++++++----
 5 files changed, 127 insertions(+), 139 deletions(-)

diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py
index fb7b421d16..cd771923b4 100644
--- a/frictionless/detector/detector.py
+++ b/frictionless/detector/detector.py
@@ -9,7 +9,6 @@

 from .. import helpers, settings
 from ..dialect import Dialect
-from ..exception import FrictionlessException
 from ..fields import AnyField
 from ..metadata import Metadata
 from ..platform import platform
@@ -403,32 +402,6 @@ def detect_schema(
                 fields[index] = AnyField(name=name, schema=schema)  # type: ignore
             schema.fields = fields  # type: ignore

-        # Sync schema
-        if self.schema_sync and labels:
-            case_sensitive = options["header_case"]
-
-            if not case_sensitive:
-                labels = [label.lower() for label in labels]
-
-            if len(labels) != len(set(labels)):
-                note = '"schema_sync" requires unique labels in the header'
-                raise FrictionlessException(note)
-
-            mapped_fields = self.map_schema_fields_by_name(
-                schema.fields,
-                case_sensitive,
-            )
-
-            self.rearrange_schema_fields_given_labels(
-                mapped_fields,
-                schema,
-                labels,
-            )
-
-            self.add_missing_required_labels_to_schema_fields(
-                mapped_fields, schema, labels, case_sensitive
-            )
-
         # Patch schema
         if self.schema_patch:
             patch = deepcopy(self.schema_patch)
@@ -442,59 +415,3 @@ def detect_schema(
             schema = Schema.from_descriptor(descriptor)

         return schema
-
-    @staticmethod
-    def map_schema_fields_by_name(
-        fields: List[Field], case_sensitive: bool
-    ) -> Dict[str, Field]:
-        """Create a dictionary to map field names with schema fields"""
-        if case_sensitive:
-            return {field.name: field for field in fields}
-        else:
-            return {field.name.lower(): field for field in fields}
-
-    @staticmethod
-    def rearrange_schema_fields_given_labels(
-        fields_mapping: Dict[str, Field],
-        schema: Schema,
-        labels: List[str],
-    ):
-        """Rearrange fields according to the order of labels.
-        All fields missing from labels are dropped.
-        Any extra field is filled in with a default `"type": "any"` field.
-
-        For example (illustrative): with schema fields `a, b` and labels
-        `["b", "c"]`, the resulting fields are `b` followed by an
-        on-the-fly `c` of type `any`; `a` is dropped.
-        """
-        schema.clear_fields()
-
-        for name in labels:
-            default_field = Field.from_descriptor({"name": name, "type": "any"})
-            field = fields_mapping.get(name, default_field)
-            schema.add_field(field)
-
-    def add_missing_required_labels_to_schema_fields(
-        self,
-        fields_mapping: Dict[str, Field],
-        schema: Schema,
-        labels: List[str],
-        case_sensitive: bool,
-    ):
-        """This method aims to add missing required labels and
-        primary key field not in labels to schema fields.
- """ - for name, field in fields_mapping.items(): - if ( - self.field_is_required(field, schema, case_sensitive) - and name not in labels - ): - schema.add_field(field) - - @staticmethod - def field_is_required( - field: Field, - schema: Schema, - case_sensitive: bool, - ) -> bool: - if case_sensitive: - return field.required or field.name in schema.primary_key - else: - lower_primary_key = [pk.lower() for pk in schema.primary_key] - return field.required or field.name.lower() in lower_primary_key diff --git a/frictionless/resource/__spec__/test_validate.py b/frictionless/resource/__spec__/test_validate.py index e5a539358f..0f2af067ee 100644 --- a/frictionless/resource/__spec__/test_validate.py +++ b/frictionless/resource/__spec__/test_validate.py @@ -509,12 +509,6 @@ def test_resource_validate_detector_sync_schema(): ) report = resource.validate() assert report.valid - assert resource.schema.to_descriptor() == { - "fields": [ - {"name": "name", "type": "string"}, - {"name": "id", "type": "integer"}, - ], - } def test_resource_validate_detector_sync_schema_invalid(): diff --git a/frictionless/resource/__spec__/test_validate_schema.py b/frictionless/resource/__spec__/test_validate_schema.py index b386f05c3b..7dc01953c8 100644 --- a/frictionless/resource/__spec__/test_validate_schema.py +++ b/frictionless/resource/__spec__/test_validate_schema.py @@ -435,7 +435,7 @@ def test_validate_resource_ignoring_header_case_issue_1635(): dialect=Dialect(header_case=tc["header_case"]), ) report = frictionless.validate(resource) - assert report.valid == tc["expected_valid_report"] + assert report.valid == tc["expected_valid_report"], report.task.errors assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == tc[ "expected_flattened_report" ] diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index a3749fdd97..77ee2523a6 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -217,6 +217,7 @@ def __open_header(self): fields=self.schema.fields, row_numbers=self.dialect.header_rows, ignore_case=not self.dialect.header_case, + schema_sync=self.detector.schema_sync, ) # Handle errors @@ -378,12 +379,6 @@ def row_stream(): # Yield row yield row - if self.detector.schema_sync: - # Missing required labels are not included in the - # fields_info parameter used for row creation - for field in self.schema.fields: - self.remove_missing_required_label_from_field_info(field, fields_info) - self.__row_stream = row_stream() def remove_missing_required_label_from_field_info( @@ -417,9 +412,7 @@ def label_is_missing( def primary_key_cells(self, row: Row, case_sensitive: bool) -> Tuple[Any, ...]: """Create a tuple containg all cells from a given row associated to primary keys""" - return tuple( - row[label] for label in self.primary_key_labels(row, case_sensitive) - ) + return tuple(row[label] for label in self.primary_key_labels(row, case_sensitive)) def primary_key_labels( self, @@ -703,11 +696,14 @@ class FieldsInfo: interfaces. 
""" - def __init__( - self, fields: List[Field], labels: Optional[List[str]], schema_sync: bool - ): + def __init__(self, fields: List[Field], labels: List[str], schema_sync: bool): + self._labels = labels + """Actual labels as found in data""" + + self._expected_fields: List[_FieldInfo] = [] + """Fields that are expected, in the order they are expected""" + if schema_sync and labels: - self._expected_fields: List[_FieldInfo] = [] if len(labels) != len(set(labels)): note = '"schema_sync" requires unique labels in the header' raise FrictionlessException(note) @@ -737,9 +733,7 @@ def get(self, field_name: str) -> _FieldInfo: ValueError """ try: - return next( - fi for fi in self._expected_fields if fi.field.name == field_name - ) + return next(fi for fi in self._expected_fields if fi.field.name == field_name) except StopIteration: raise ValueError(f"{field_name} is missing from expected fields") diff --git a/frictionless/table/header.py b/frictionless/table/header.py index ed485a1ad5..ac978761fd 100644 --- a/frictionless/table/header.py +++ b/frictionless/table/header.py @@ -1,12 +1,13 @@ from __future__ import annotations from functools import cached_property -from typing import TYPE_CHECKING, List +from typing import Iterable, List, Optional, Tuple from .. import errors, helpers +from ..exception import FrictionlessException +from ..schema import Field -if TYPE_CHECKING: - from ..schema import Field +Label = str class Header(List[str]): # type: ignore @@ -29,9 +30,11 @@ def __init__( fields: List[Field], row_numbers: List[int], ignore_case: bool = False, + schema_sync: bool, ): super().__init__(field.name for field in fields) - self.__fields = [field.to_copy() for field in fields] + self.__fields = fields.copy() + self.__schema_sync = schema_sync self.__field_names = self.copy() self.__row_numbers = row_numbers self.__ignore_case = ignore_case @@ -125,47 +128,40 @@ def __process(self): if self.missing: return - # Prepare context - labels = self.__labels - fields = self.__fields + labels = self.labels # Extra label - if len(fields) < len(labels): - start = len(fields) + 1 - iterator = labels[len(fields) :] - for field_number, label in enumerate(iterator, start=start): + start = len(self.fields) + 1 + for field_number, label in enumerate(self._get_extra_labels(), start): + self.__errors.append( + errors.ExtraLabelError( + note="", + labels=list(map(str, labels)), + row_numbers=self.__row_numbers, + label=label, + field_name="", + field_number=field_number, + ) + ) + + # Missing label + start = len(labels) + 1 + for field_number, field in enumerate(self._get_missing_fields(), start=start): + if field is not None: # type: ignore self.__errors.append( - errors.ExtraLabelError( + errors.MissingLabelError( note="", labels=list(map(str, labels)), row_numbers=self.__row_numbers, label="", - field_name="", + field_name=field.name, field_number=field_number, ) ) - # Missing label - if len(fields) > len(labels): - start = len(labels) + 1 - iterator = fields[len(labels) :] - for field_number, field in enumerate(iterator, start=start): - if field is not None: # type: ignore - self.__errors.append( - errors.MissingLabelError( - note="", - labels=list(map(str, labels)), - row_numbers=self.__row_numbers, - label="", - field_name=field.name, - field_number=field_number, - ) - ) - # Iterate items - field_number = 0 - for field, label in zip(fields, labels): - field_number += 1 + for index, (field, label) in enumerate(self._zip_fields_and_labels()): + field_number = index + 1 # Blank label if not label: @@ 
-228,3 +224,90 @@ def __process(self):
                     row_numbers=self.__row_numbers,
                 )
             ]
+
+    def _get_extra_labels(self) -> List[str]:
+        """Returns unauthorized extra labels.
+
+        If `schema_sync=False`, the labels are expected to be in the same order
+        and number as the schema fields. If the number of labels
+        is greater than the number of fields, the last labels are returned as
+        extra labels.
+
+        If `schema_sync=True`, extra labels are ignored and this method
+        returns an empty list.
+        """
+        if not self.__schema_sync:
+            if len(self.fields) < len(self.labels):
+                return self.labels[len(self.fields) :]
+        return []
+
+    def _get_missing_fields(self) -> List[Field]:
+        """Returns unauthorized missing fields.
+
+        If `schema_sync=False`, the labels are expected to be in the same order
+        and number as the schema fields. If the number of fields is
+        greater than the number of labels, the last fields are returned as
+        missing fields.
+
+        If `schema_sync=True`, missing fields are ignored, except if they are
+        marked as `required`.
+        """
+
+        fields = self.fields
+        labels = self.labels
+        if not self.__schema_sync:
+            if len(fields) > len(labels):
+                return fields[len(labels) :]
+        else:
+
+            def required_and_missing(field: Field) -> bool:
+                required: bool = field.required or (
+                    field.schema is not None and field.name in field.schema.primary_key
+                )
+                missing = self._normalize(field.name) not in [
+                    self._normalize(label) for label in labels
+                ]
+                return required and missing
+
+            return [field for field in fields if required_and_missing(field)]
+
+        return []
+
+    def _zip_fields_and_labels(self) -> Iterable[Tuple[Field, Label]]:
+        """Returns pairs of (field, label), where the label is expected to
+        comply with the field's expectations.
+
+        If `schema_sync=False`, the labels are expected to be in the same order,
+        so fields and labels can be associated by position.
+
+        If `schema_sync=True`, labels are associated with fields of the same
+        name. If no such field exists, an extra field with `type: any` is
+        created on the fly.
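+
+        Illustrative sketch: with schema fields `id, name` and labels
+        `["name", "extra"]` under `schema_sync=True`, the pairs are
+        `(name, "name")` and an on-the-fly `any` field named "extra".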
+ """ + if not self.__schema_sync: + return zip(self.fields, self.labels) + else: + zipped: List[Tuple[Field, Label]] = [] + for label in self.labels: + if len(self.labels) != len(set(self.labels)): + note = '"schema_sync" requires unique labels in the header' + raise FrictionlessException(note) + field = self._find_field_by_name(label) + + if not field: + # Default value + field = Field.from_descriptor({"name": label, "type": "any"}) + + zipped.append((field, label)) + return zipped + + def _find_field_by_name(self, name: str) -> Optional[Field]: + try: + return next( + f for f in self.fields if self._normalize(f.name) == self._normalize(name) + ) + except StopIteration: + return None + + def _normalize(self, s: str) -> str: + return s.lower() if self.__ignore_case else s From 9b7d0ad39bf84aa5789ae1185e2c7f44f9174c26 Mon Sep 17 00:00:00 2001 From: Pierre Camilleri Date: Fri, 7 Feb 2025 16:23:48 +0100 Subject: [PATCH 14/23] =?UTF-8?q?=F0=9F=9F=A2=20get=20rid=20of=20FieldInfo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../formats/inline/__spec__/test_parser.py | 5 +- frictionless/resources/table.py | 7 +- frictionless/table/header.py | 58 +++++---- frictionless/table/row.py | 110 +++++++++--------- 4 files changed, 99 insertions(+), 81 deletions(-) diff --git a/frictionless/formats/inline/__spec__/test_parser.py b/frictionless/formats/inline/__spec__/test_parser.py index 829c695b2a..c813e08dbc 100644 --- a/frictionless/formats/inline/__spec__/test_parser.py +++ b/frictionless/formats/inline/__spec__/test_parser.py @@ -99,7 +99,10 @@ def test_inline_parser_from_ordered_dict(): def test_inline_parser_read_inline_data_with_missing_column_values(): data = [{"id": 1}, {"id": 2, "name": "中国人"}] schema = { - "fields": [{"name": "id", "type": "integer"}, {"name": "name", "type": "string"}] + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "name", "type": "string"}, + ] } with TableResource(data=data, schema=Schema.from_descriptor(schema)) as resource: assert resource.read_rows() == [ diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 77ee2523a6..3f2db52fb8 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -264,10 +264,6 @@ def __open_lookup(self): self.__lookup[source_name][source_key].add(cells) def __open_row_stream(self): - fields_info = FieldsInfo( - self.schema.fields, self.labels, self.detector.schema_sync - ) - # Create state memory_unique: Dict[str, Any] = {} memory_primary: Dict[Tuple[Any], Any] = {} @@ -297,9 +293,10 @@ def row_stream(): for row_number, cells in enumerated_content_stream: self.stats.rows += 1 + assert self.__header row = Row( cells, - fields_info=fields_info, + expected_fields=self.__header.get_expected_fields(), row_number=row_number, ) diff --git a/frictionless/table/header.py b/frictionless/table/header.py index ac978761fd..2d60ac0827 100644 --- a/frictionless/table/header.py +++ b/frictionless/table/header.py @@ -1,7 +1,7 @@ from __future__ import annotations from functools import cached_property -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional from .. 
import errors, helpers
 from ..exception import FrictionlessException
@@ -34,6 +34,7 @@ def __init__(
     ):
         super().__init__(field.name for field in fields)
         self.__fields = fields.copy()
+        self.__expected_fields: Optional[List[Field]] = None
         self.__schema_sync = schema_sync
         self.__field_names = self.copy()
@@ -132,7 +133,7 @@ def __process(self):

         # Extra label
         start = len(self.fields) + 1
-        for field_number, label in enumerate(self._get_extra_labels(), start):
+        for field_number, label in enumerate(self._get_extra_labels(), start=start):
             self.__errors.append(
                 errors.ExtraLabelError(
@@ -160,7 +161,8 @@ def __process(self):
             )

         # Iterate items
-        for index, (field, label) in enumerate(self._zip_fields_and_labels()):
+        for index, (field, label) in enumerate(zip(self.get_expected_fields(), labels)):
+            # field_number is 1-indexed
             field_number = index + 1

             # Blank label
@@ -226,7 +228,7 @@ def __process(self):
             ]

     def _get_extra_labels(self) -> List[str]:
-        """Returns unauthorized extra labels.
+        """Returns unexpected extra labels.

         If `schema_sync=False`, the labels are expected to be in the same order
         and number as the schema fields. If the number of labels
@@ -242,7 +244,7 @@ def _get_extra_labels(self) -> List[str]:
         return []

     def _get_missing_fields(self) -> List[Field]:
-        """Returns unauthorized missing fields.
+        """Returns unexpected missing fields.

         If `schema_sync=False`, the labels are expected to be in the same order
         and number as the schema fields. If the number of fields is
@@ -273,33 +275,43 @@ def required_and_missing(field: Field) -> bool:

             return [field for field in fields if required_and_missing(field)]

-    def _zip_fields_and_labels(self) -> Iterable[Tuple[Field, Label]]:
-        """Returns pairs of (field, label), where the label is expected to
-        comply with the field's expectations.
+    def get_expected_fields(self) -> List[Field]:
+        """Returns a list of fields, in the order they are expected to be
+        found in the data.

-        If `schema_sync=False`, the labels are expected to be in the same order,
-        so fields and labels can be associated by position.
+        The label at the same position, and its associated
+        data, are expected to comply with the field's expectations.

-        If `schema_sync=True`, labels are associated with fields of the same
-        name. If no such field exists, an extra field with `type: any` is
+        If `schema_sync=False`, the schema fields are precisely the expected
+        fields, so they are returned unchanged.
+
+        If `schema_sync=True`, fields are reordered so that the field names
+        match the labels. If no such field exists, an extra field with `type: any` is
         created on the fly.
-
-        Illustrative sketch: with schema fields `id, name` and labels
-        `["name", "extra"]` under `schema_sync=True`, the pairs are
-        `(name, "name")` and an on-the-fly `any` field named "extra".
+
+        The result is cached as a property.
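+
+        Rows use this as their field reference, e.g. (illustrative):
+
+            row = Row(
+                cells,
+                expected_fields=header.get_expected_fields(),
+                row_number=2,
+            )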
""" - if not self.__schema_sync: - return zip(self.fields, self.labels) - else: - zipped: List[Tuple[Field, Label]] = [] - for label in self.labels: + if not self.__expected_fields: + if not self.__schema_sync: + self.__expected_fields = self.fields + else: + expected_fields: List[Field] = [] + if len(self.labels) != len(set(self.labels)): note = '"schema_sync" requires unique labels in the header' raise FrictionlessException(note) - field = self._find_field_by_name(label) - if not field: - # Default value - field = Field.from_descriptor({"name": label, "type": "any"}) + for label in self.labels: + field = self._find_field_by_name(label) + + if not field: + # Default value + field = Field.from_descriptor({"name": label, "type": "any"}) + + expected_fields.append(field) + self.__expected_fields = expected_fields - zipped.append((field, label)) - return zipped + return self.__expected_fields def _find_field_by_name(self, name: str) -> Optional[Field]: try: diff --git a/frictionless/table/row.py b/frictionless/table/row.py index b817b51759..2391d5d3cc 100644 --- a/frictionless/table/row.py +++ b/frictionless/table/row.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections import OrderedDict from functools import cached_property from itertools import zip_longest from typing import TYPE_CHECKING, Any, Dict, List, Optional @@ -12,7 +13,7 @@ # We can consider adding row.header property to provide more comprehensive API if TYPE_CHECKING: - from ..resources.table import FieldsInfo + from ..schema.field import Field # TODO: add types @@ -21,6 +22,9 @@ class Row(Dict[str, Any]): > Constructor of this object is not Public API + It works like a lazy dictionary: dictionary values are only computed (see + `__process` method) if needed. + This object is returned by `extract`, `resource.read_rows`, and other functions. 
```python
@@ -39,11 +43,13 @@ def __init__(
         self,
         cells: List[Any],
         *,
-        fields_info: FieldsInfo,
+        expected_fields: List[Field],
         row_number: int,
     ):
         self.__cells = cells
-        self.__fields_info = fields_info
+        self.__expected_fields: OrderedDict[str, Field] = OrderedDict(
+            (f.name, f) for f in expected_fields
+        )
         self.__row_number = row_number
         self.__processed: bool = False
         self.__blank_cells: Dict[str, Any] = {}
@@ -55,18 +61,18 @@ def __eq__(self, other: object):
         return super().__eq__(other)
 
     def __str__(self):
-        self.__process()
         return super().__str__()
 
     def __repr__(self):
-        self.__process()
         return super().__repr__()
 
     def __setitem__(self, key: str, value: Any):
         try:
-            field_number = self.__fields_info.get(key).field_number
+            keys = [k for k in self.__expected_fields.keys()]
+            field_number = keys.index(key) + 1
         except ValueError:
             raise KeyError(f"Row does not have a field {key}")
+
         if len(self.__cells) < field_number:
             self.__cells.extend([None] * (field_number - len(self.__cells)))
         self.__cells[field_number - 1] = value
@@ -76,30 +82,33 @@ def __missing__(self, key: str):
         return self.__process(key)
 
     def __iter__(self):
-        return iter(self.__fields_info.ls())
+        return iter(self.__expected_fields)
 
     def __len__(self):
-        return len(self.__fields_info.ls())
+        return len(self.__expected_fields)
 
     def __contains__(self, key: object):
-        return key in self.__fields_info.ls()
+        return key in self.__expected_fields
 
     def __reversed__(self):
-        return reversed(self.__fields_info.ls())
+        return reversed(self.__expected_fields.keys())
 
-    def keys(self):  # type: ignore
-        return iter(self.__fields_info.ls())
+    def keys(self):
+        return self.__expected_fields.keys()
 
     def values(self):  # type: ignore
-        for name in self.__fields_info.ls():
+        self.__process()
+        for name in self.__expected_fields:
             yield self[name]
 
     def items(self):  # type: ignore
-        for name in self.__fields_info.ls():
+        self.__process()
+        for name in self.__expected_fields:
             yield (name, self[name])
 
     def get(self, key: str, default: Optional[Any] = None):
-        if key not in self.__fields_info.ls():
+        self.__process()
+        if key not in self.__expected_fields:
             return default
         return self[key]
@@ -109,6 +118,7 @@ def cells(self):
         .ls():
             Field[]: table schema fields
         """
+        self.__process()
         return self.__cells
 
     @cached_property
@@ -117,7 +127,7 @@ def fields(self):
         Returns:
             Field[]: table schema fields
         """
-        return self.__fields_info.get_copies()
+        return [f.to_copy() for f in self.__expected_fields.values()]
 
     @cached_property
     def field_names(self) -> List[str]:
@@ -125,7 +135,7 @@
         Returns:
             str[]: field names
         """
-        return self.__fields_info.ls()
+        return [k for k in self.__expected_fields]
 
     @cached_property
     def field_numbers(self):
@@ -133,7 +143,7 @@
         Returns:
             str[]: field numbers
         """
-        return list(range(1, len(self.__fields_info.ls()) + 1))
+        return list(range(1, len(self.__expected_fields) + 1))
 
     @cached_property
     def row_number(self) -> int:
@@ -204,17 +214,16 @@ def to_list(self, *, json: bool = False, types: Optional[List[str]] = None):
         # Prepare
         self.__process()
-        result = [self[name] for name in self.__fields_info.ls()]
+        result = [self[name] for name in self.field_names]
         if types is None and json:
             types = platform.frictionless_formats.JsonParser.supported_types
 
         # Convert
         if types is not None:
-            field_names = self.__fields_info.ls()
+            field_names = self.field_names
             for index, field_name in enumerate(field_names):
-                field_info = self.__fields_info.get(field_name)
-                field = field_info.field
-                cell_writer = field_info.cell_writer
+                field = self.__expected_fields[field_name]
+                cell_writer = field.create_cell_writer()
 
                 # Here we can optimize performance if we use a types mapping
                 if field.type in types:
@@ -246,7 +255,7 @@ def to_dict(
         # Prepare
         self.__process()
-        result = {name: self[name] for name in self.__fields_info.ls()}
+        result = {name: self[name] for name in self.__expected_fields}
         if types is None and json:
             types = platform.frictionless_formats.JsonParser.supported_types
         if types is None and csv:
@@ -254,11 +263,10 @@
         # Convert
         if types is not None:
-            field_names = self.__fields_info.ls()
+            field_names = self.field_names
             for field_name in field_names:
-                field_info = self.__fields_info.get(field_name)
-                field = field_info.field
-                cell_writer = field_info.cell_writer
+                field = self.__expected_fields[field_name]
+                cell_writer = field.create_cell_writer()
 
                 # Here we can optimize performance if we use a types mapping
                 if field.type not in types:
@@ -283,50 +291,48 @@ def __process(self, key: Optional[str] = None):
         # Prepare context
         cells = self.__cells
         to_str = lambda v: str(v) if v is not None else ""  # type: ignore
-        fields = self.__fields_info.get_copies()
-        names = self.__fields_info.ls()
-        field_infos = [self.__fields_info.get(name) for name in names]
-        iterator = zip_longest(field_infos, cells)
+        fields = [f.to_copy() for f in self.__expected_fields.values()]
+
+        iterator = zip_longest(range(len(fields)), self.__expected_fields.values(), cells)
         is_empty = not bool(super().__len__())
         if key:
             try:
-                field_info = self.__fields_info.get(key)
-                field_number = field_info.field_number
+                field = self.__expected_fields.get(key)
+                field_index = self.field_names.index(key)
             except ValueError:
                 raise KeyError(f"Row does not have a field {key}")
-            cell = cells[field_number - 1] if len(cells) >= field_number else None
-            iterator = zip([field_info], [cell])
+
+            cell = cells[field_index] if len(cells) >= field_index + 1 else None
+            iterator = [(field_index, field, cell)]
 
         # Iterate cells
-        for field_info, source in iterator:
+        for index, field, cell in iterator:
 
             # Prepare context
-            if field_info is None:
+            if field is None:
                 break
-            field = field_info.field
-            field_number = field_info.field_number
-            cell_reader = field_info.cell_reader
+            cell_reader = field.create_cell_reader()
             if not is_empty and super().__contains__(field.name):
                 continue
 
             # Read cell
-            target, notes = cell_reader(source)
+            target, notes = cell_reader(cell)
             type_note = notes.pop("type", None) if notes else None
             if target is None and not type_note:
-                self.__blank_cells[field.name] = source
+                self.__blank_cells[field.name] = cell
 
             # Type error
            if type_note:
-                self.__error_cells[field.name] = source
+                self.__error_cells[field.name] = cell
                 self.__errors.append(
                     errors.TypeError(
                         note=type_note,
                         cells=list(map(to_str, cells)),  # type: ignore
                         row_number=self.__row_number,
-                        cell=str(source),
+                        cell=str(cell),
                         field_name=field.name,
-                        field_number=field_number,
+                        field_number=index + 1,
                     )
                 )
@@ -338,9 +344,9 @@ def __process(self, key: Optional[str] = None):
                         note=note,
                         cells=list(map(to_str, cells)),  # type: ignore
                         row_number=self.__row_number,
-                        cell=str(source),
+                        cell=str(cell),
                         field_name=field.name,
-                        field_number=field_number,
+                        field_number=index + 1,
                     )
                 )
@@ -353,7 +359,7 @@ def __process(self, key: Optional[str] = None):
         if len(fields) < len(cells):
             start = len(fields) + 1
             iterator = cells[len(fields) :]
-            for field_number, cell in enumerate(iterator, start=start):
+            for field_index, cell in enumerate(iterator, start=start):
                 self.__errors.append(
                     errors.ExtraCellError(
                         note="",
                         cells=list(map(to_str, cells)),  # type: ignore
                         row_number=self.__row_number,
                         cell=str(cell),
                         field_name="",
-                        field_number=field_number,
+                        field_number=field_index,
                     )
                 )
@@ -369,7 +375,7 @@ def __process(self, key: Optional[str] = None):
         if len(fields) > len(cells):
             start = len(cells) + 1
             iterator = fields[len(cells) :]
-            for field_number, field in enumerate(iterator, start=start):
+            for field_index, field in enumerate(iterator, start=start):
                 if field is not None:
                     self.__errors.append(
                         errors.MissingCellError(
                             note="",
                             cells=list(map(to_str, cells)),  # type: ignore
                             row_number=self.__row_number,
                             cell="",
                             field_name=field.name,
-                            field_number=field_number,
+                            field_number=field_index,
                         )
                     )

From a80e79a19e3c0e7fa249c8904947eae83b05bc74 Mon Sep 17 00:00:00 2001
From: Pierre Camilleri
Date: Fri, 7 Feb 2025 16:39:22 +0100
Subject: [PATCH 15/23] remove unnecessary review noise

---
 frictionless/formats/inline/__spec__/test_parser.py    | 5 +----
 frictionless/resource/__spec__/test_validate_schema.py | 2 +-
 frictionless/table/header.py                           | 2 +-
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/frictionless/formats/inline/__spec__/test_parser.py b/frictionless/formats/inline/__spec__/test_parser.py
index c813e08dbc..829c695b2a 100644
--- a/frictionless/formats/inline/__spec__/test_parser.py
+++ b/frictionless/formats/inline/__spec__/test_parser.py
@@ -99,10 +99,7 @@ def test_inline_parser_from_ordered_dict():
 def test_inline_parser_read_inline_data_with_missing_column_values():
     data = [{"id": 1}, {"id": 2, "name": "中国人"}]
     schema = {
-        "fields": [
-            {"name": "id", "type": "integer"},
-            {"name": "name", "type": "string"},
-        ]
+        "fields": [{"name": "id", "type": "integer"}, {"name": "name", "type": "string"}]
     }
     with TableResource(data=data, schema=Schema.from_descriptor(schema)) as resource:
         assert resource.read_rows() == [

diff --git a/frictionless/resource/__spec__/test_validate_schema.py b/frictionless/resource/__spec__/test_validate_schema.py
index 7dc01953c8..b386f05c3b 100644
--- a/frictionless/resource/__spec__/test_validate_schema.py
+++ b/frictionless/resource/__spec__/test_validate_schema.py
@@ -435,7 +435,7 @@ def test_validate_resource_ignoring_header_case_issue_1635():
         dialect=Dialect(header_case=tc["header_case"]),
     )
     report = frictionless.validate(resource)
-    assert report.valid == tc["expected_valid_report"], report.task.errors
+    assert report.valid == tc["expected_valid_report"]
     assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == tc[
         "expected_flattened_report"
     ]

diff --git a/frictionless/table/header.py b/frictionless/table/header.py
index 2d60ac0827..2a7727d8f1 100644
--- a/frictionless/table/header.py
+++ b/frictionless/table/header.py
@@ -33,7 +33,7 @@ def __init__(
         schema_sync: bool,
     ):
         super().__init__(field.name for field in fields)
-        self.__fields = fields.copy()
+        self.__fields = [field.to_copy() for field in fields]
         self.__expected_fields: Optional[List[Field]] = None
         self.__schema_sync = schema_sync
         self.__field_names = self.copy()
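The row.py refactor above replaces the `FieldsInfo` indirection with a plain `OrderedDict[str, Field]`, so field order and field lookup now come from a single structure, and printing a row no longer forces cell processing (that moves into `values()`, `items()`, `get()`, and the cached properties). A minimal standalone sketch of the lookup pattern, with `int`/`str` casters standing in for frictionless `Field` objects:

```python
from collections import OrderedDict

# Toy stand-in for the OrderedDict[str, Field] mapping: field name -> caster.
expected_fields = OrderedDict([("id", int), ("name", str)])

def field_number(key: str) -> int:
    """1-based position of a field, derived from key order as in __setitem__."""
    keys = list(expected_fields)
    try:
        return keys.index(key) + 1
    except ValueError:
        raise KeyError(f"Row does not have a field {key}")

assert field_number("id") == 1
assert field_number("name") == 2
```

Compared to the removed `_FieldInfo(field, field_number)` pairs, positions are recomputed from key order on demand: one source of truth, at the cost of an O(n) index lookup per access.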
From b7f693d50a231e6aa55d8d82478935710719147e Mon Sep 17 00:00:00 2001
From: Pierre Camilleri
Date: Fri, 7 Feb 2025 16:49:26 +0100
Subject: [PATCH 16/23] remove unused function

---
 frictionless/resources/table.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py
index 3f2db52fb8..b1fe898cb5 100644
--- a/frictionless/resources/table.py
+++ b/frictionless/resources/table.py
@@ -378,15 +378,6 @@ def row_stream():
         self.__row_stream = row_stream()
 
-    def remove_missing_required_label_from_field_info(
-        self, field: Field, fields_info: FieldsInfo
-    ):
-        is_case_sensitive = self.dialect.header_case
-        if self.label_is_missing(
-            field.name, fields_info.ls(), self.labels, is_case_sensitive
-        ):
-            fields_info.rm(field.name)
-
     @staticmethod
     def label_is_missing(
         field_name: str,

From 77a2c9f693d3da90ac6a8df90926a34f40496b11 Mon Sep 17 00:00:00 2001
From: Pierre Camilleri
Date: Fri, 7 Feb 2025 16:50:05 +0100
Subject: [PATCH 17/23] remove unused FieldInfo

---
 frictionless/resources/table.py | 75 ---------------------------------
 1 file changed, 75 deletions(-)

diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py
index b1fe898cb5..4e9580d4fb 100644
--- a/frictionless/resources/table.py
+++ b/frictionless/resources/table.py
@@ -660,78 +660,3 @@ def write(
         self, target: Optional[Union[Resource, Any]] = None, **options: Any
     ) -> TableResource:
         return self.write_table(target, **options)
-
-
-class _FieldInfo:
-    """Private class to store additional data alongside a field."""
-
-    def __init__(self, field: Field, field_number: int):
-        """field_number: 1-indexed rank of the field"""
-        self.field = field
-        self.field_number = field_number
-        self.cell_reader = field.create_cell_reader()
-        self.cell_writer = field.create_cell_writer()
-
-
-class FieldsInfo:
-    """Helper class for linking columns to schema fields.
-
-    It abstracts away the different ways of making this link. In particular, the
-    reference may be the schema (`detector.schema_sync = False`), or the labels
-    (`detector.schema_sync = True`).
-
-    This class is not Public API, and should be used only in non-public
-    interfaces.
-    """
-
-    def __init__(self, fields: List[Field], labels: List[str], schema_sync: bool):
-        self._labels = labels
-        """Actual labels as found in data"""
-
-        self._expected_fields: List[_FieldInfo] = []
-        """Fields that are expected, in the order they are expected"""
-
-        if schema_sync and labels:
-            if len(labels) != len(set(labels)):
-                note = '"schema_sync" requires unique labels in the header'
-                raise FrictionlessException(note)
-
-            for label_index, label in enumerate(labels):
-                try:
-                    field = next(f for f in fields if f.name == label)
-                except StopIteration:
-                    field = Field.from_descriptor({"name": label, "type": "any"})
-                self._expected_fields.append(_FieldInfo(field, label_index + 1))
-        else:
-            self._expected_fields = [
-                _FieldInfo(field, i + 1) for i, field in enumerate(fields)
-            ]
-
-    def ls(self) -> List[str]:
-        """List all column names"""
-        return [fi.field.name for fi in self._expected_fields]
-
-    def get(self, field_name: str) -> _FieldInfo:
-        """Get a Field by its name
-
-        In case no field with field_name exists, the behavior depends on
-        the `detector.schema_sync` option:
-
-        Raises:
-            ValueError
-        """
-        try:
-            return next(fi for fi in self._expected_fields if fi.field.name == field_name)
-        except StopIteration:
-            raise ValueError(f"{field_name} is missing from expected fields")
-
-    def get_copies(self) -> List[Field]:
-        """Return field copies"""
-        return [fi.field.to_copy() for fi in self._expected_fields]
-
-    def rm(self, field_name: str):
-        try:
-            i = self.ls().index(field_name)
-            del self._expected_fields[i]
-        except ValueError:
-            raise ValueError(f"'{field_name}' is not in fields data")
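For context, the `FieldsInfo` class deleted above encoded the two ways of linking columns to schema fields: without `schema_sync` the schema is the reference; with `schema_sync` the header labels are, duplicates are rejected, and a label with no matching field falls back to an `any` field. A rough sketch of that rule with plain types instead of frictionless classes (all names here are illustrative, not frictionless API):

```python
from typing import Dict, List, Tuple

def match_expected_fields(
    schema_fields: Dict[str, str], labels: List[str], schema_sync: bool
) -> List[Tuple[str, str]]:
    """Return (name, type) pairs in the order cells are expected."""
    # Without schema_sync, the schema itself drives the expected fields.
    if not (schema_sync and labels):
        return list(schema_fields.items())
    # With schema_sync, duplicate header labels are rejected up front...
    if len(labels) != len(set(labels)):
        raise ValueError('"schema_sync" requires unique labels in the header')
    # ...and each label keeps its schema type or falls back to "any".
    return [(label, schema_fields.get(label, "any")) for label in labels]

schema = {"id": "integer", "name": "string"}
print(match_expected_fields(schema, ["name", "comment"], schema_sync=True))
# [('name', 'string'), ('comment', 'any')]
```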
From 305d2b9e57c8584afc7595e2ad71e8aa40ccbff6 Mon Sep 17 00:00:00 2001
From: Pierre Camilleri
Date: Fri, 7 Feb 2025 16:57:41 +0100
Subject: [PATCH 18/23] linting

---
 frictionless/resources/table.py | 2 --
 frictionless/table/header.py    | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py
index 4e9580d4fb..0bc1a2f254 100644
--- a/frictionless/resources/table.py
+++ b/frictionless/resources/table.py
@@ -5,8 +5,6 @@ import warnings
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
-from frictionless.schema.field import Field
-
 from .. import errors, helpers
 from ..analyzer import Analyzer
 from ..dialect import Dialect

diff --git a/frictionless/table/header.py b/frictionless/table/header.py
index 2a7727d8f1..3af70338e7 100644
--- a/frictionless/table/header.py
+++ b/frictionless/table/header.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from functools import cached_property
-from typing import Iterable, List, Optional
+from typing import List, Optional
 
 from .. import errors, helpers
 from ..exception import FrictionlessException

From 91d3ffb6cc53e175056e37afb8b4dd9f74710f9d Mon Sep 17 00:00:00 2001
From: Pierre Camilleri
Date: Fri, 7 Feb 2025 16:59:52 +0100
Subject: [PATCH 19/23] Information on processing for Row.__str__ and Row.__repr__

---
 frictionless/table/row.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/frictionless/table/row.py b/frictionless/table/row.py
index 2391d5d3cc..440edb3058 100644
--- a/frictionless/table/row.py
+++ b/frictionless/table/row.py
@@ -61,10 +61,16 @@ def __eq__(self, other: object):
         return super().__eq__(other)
 
     def __str__(self):
-        return super().__str__()
+        s = ""
+        if not self.__processed:
+            s = "Unprocessed: "
+        return s + super().__str__()
 
     def __repr__(self):
-        return super().__repr__()
+        s = ""
+        if not self.__processed:
+            s = "Unprocessed: "
+        return s + "Row" + super().__repr__()
 
     def __setitem__(self, key: str, value: Any):
         try:

From 575885bcd7381ee562a2dbd2db2a1cf38db4964c Mon Sep 17 00:00:00 2001
From: Pierre Camilleri
Date: Fri, 7 Feb 2025 17:07:48 +0100
Subject: [PATCH 20/23] typo

---
 frictionless/table/row.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/frictionless/table/row.py b/frictionless/table/row.py
index 440edb3058..c0046227f3 100644
--- a/frictionless/table/row.py
+++ b/frictionless/table/row.py
@@ -121,8 +121,8 @@ def get(self, key: str, default: Optional[Any] = None):
     @cached_property
     def cells(self):
         """
-        .ls():
-            Field[]: table schema fields
+        Returns:
+            Any[]: Table cell values
         """
         self.__process()
         return self.__cells

From c41590fd0603a85bea68f08cb0fce745906e08ea Mon Sep 17 00:00:00 2001
From: Pierre Camilleri
Date: Fri, 7 Feb 2025 17:11:43 +0100
Subject: [PATCH 21/23] unintended rename

---
 frictionless/table/row.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/frictionless/table/row.py b/frictionless/table/row.py
index c0046227f3..e610763f22 100644
--- a/frictionless/table/row.py
+++ b/frictionless/table/row.py
@@ -365,7 +365,7 @@ def __process(self, key: Optional[str] = None):
         if len(fields) < len(cells):
             start = len(fields) + 1
             iterator = cells[len(fields) :]
-            for field_index, cell in enumerate(iterator, start=start):
+            for field_number, cell in enumerate(iterator, start=start):
                 self.__errors.append(
                     errors.ExtraCellError(
                         note="",
                         cells=list(map(to_str, cells)),  # type: ignore
                         row_number=self.__row_number,
                         cell=str(cell),
                         field_name="",
-                        field_number=field_index,
+                        field_number=field_number,
                     )
                 )
@@ -381,7 +381,7 @@
         if len(fields) > len(cells):
             start = len(cells) + 1
             iterator = fields[len(cells) :]
-            for field_index, field in enumerate(iterator, start=start):
+            for field_number, field in enumerate(iterator, start=start):
                 if field is not None:
                     self.__errors.append(
                         errors.MissingCellError(
                             note="",
                             cells=list(map(to_str, cells)),  # type: ignore
                             row_number=self.__row_number,
                             cell="",
                             field_name=field.name,
-                            field_number=field_index,
+                            field_number=field_number,
                         )
                     )
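PATCH 19 above makes the lazy state visible when a row is printed instead of silently processing it; the next patch then trims the repr again because tests compare against the plain dict repr. A self-contained sketch of the convention (a toy class, not the frictionless `Row`):

```python
class LazyRow(dict):
    """Toy illustration of the lazy "Unprocessed: " marker from the patch above."""

    def __init__(self):
        super().__init__()
        self._processed = False

    def process(self):
        # Stands in for Row.__process(), which reads and casts the cells.
        self._processed = True

    def __repr__(self):
        prefix = "" if self._processed else "Unprocessed: "
        return prefix + super().__repr__()

row = LazyRow()
print(repr(row))  # Unprocessed: {}
row.process()
print(repr(row))  # {}
```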
From 37fd7097667a3509ffb03e5a7781e607b06b244f Mon Sep 17 00:00:00 2001
From: Pierre Camilleri
Date: Fri, 7 Feb 2025 17:23:04 +0100
Subject: [PATCH 22/23] Remove __repr__ change as it is used for tests

---
 frictionless/table/row.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frictionless/table/row.py b/frictionless/table/row.py
index e610763f22..80de14bbc5 100644
--- a/frictionless/table/row.py
+++ b/frictionless/table/row.py
@@ -70,7 +70,7 @@ def __repr__(self):
         s = ""
         if not self.__processed:
             s = "Unprocessed: "
-        return s + "Row" + super().__repr__()
+        return s + super().__repr__()
 
     def __setitem__(self, key: str, value: Any):
         try:

From c6c90a2141e10abd71dfe6fb1bf29dfbddcd32dd Mon Sep 17 00:00:00 2001
From: Pierre Camilleri
Date: Fri, 7 Feb 2025 17:28:37 +0100
Subject: [PATCH 23/23] fix: oopsie

---
 frictionless/table/header.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frictionless/table/header.py b/frictionless/table/header.py
index 3af70338e7..43c64d3d26 100644
--- a/frictionless/table/header.py
+++ b/frictionless/table/header.py
@@ -33,7 +33,7 @@ def __init__(
         schema_sync: bool,
     ):
         super().__init__(field.name for field in fields)
-        self.__fields = [field.to_copy() for field in fields]
+        self.__fields = fields.copy()
         self.__expected_fields: Optional[List[Field]] = None
         self.__schema_sync = schema_sync
         self.__field_names = self.copy()
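The closing fix restores the shallow `fields.copy()` in `Header.__init__` that PATCH 15 had turned into per-field deep copies, presumably because the header should keep holding the very same `Field` instances as the schema. A toy illustration of the difference (this `Field` is a stand-in, not the frictionless class):

```python
class Field:
    """Stand-in for a schema field, for illustration only."""

    def __init__(self, name: str):
        self.name = name

    def to_copy(self) -> "Field":
        return Field(self.name)

fields = [Field("id"), Field("name")]

shallow = fields.copy()                       # new list, same Field objects
deep = [field.to_copy() for field in fields]  # new list, detached Field objects

assert shallow[0] is fields[0]   # identity preserved across the copy
assert deep[0] is not fields[0]  # identity lost: schema edits no longer visible
```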