Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 0 additions & 81 deletions frictionless/detector/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

from .. import helpers, settings
from ..dialect import Dialect
from ..exception import FrictionlessException
from ..fields import AnyField
from ..metadata import Metadata
from ..platform import platform
Expand Down Expand Up @@ -411,33 +410,6 @@ def detect_schema(
fields[index] = AnyField(name=name, schema=schema) # type: ignore
schema.fields = fields # type: ignore

# Sync schema
if self.schema_sync:
if labels:
case_sensitive = options["header_case"]

if not case_sensitive:
labels = [label.lower() for label in labels]

if len(labels) != len(set(labels)):
note = '"schema_sync" requires unique labels in the header'
raise FrictionlessException(note)

mapped_fields = self.mapped_schema_fields_names(
schema.fields, # type: ignore
case_sensitive,
)

self.rearrange_schema_fields_given_labels(
mapped_fields,
schema,
labels,
)

self.add_missing_required_labels_to_schema_fields(
mapped_fields, schema, labels, case_sensitive
)

# Patch schema
if self.schema_patch:
patch = deepcopy(self.schema_patch)
Expand All @@ -452,56 +424,3 @@ def detect_schema(

return schema

@staticmethod
def mapped_schema_fields_names(
fields: List[Field], case_sensitive: bool
) -> Dict[str, Field]:
"""Create a dictionnary to map field names with schema fields"""
if case_sensitive:
return {field.name: field for field in fields}
else:
return {field.name.lower(): field for field in fields}

@staticmethod
def rearrange_schema_fields_given_labels(
fields_mapping: Dict[str, Field],
schema: Schema,
labels: List[str],
):
"""Rearrange fields according to the order of labels. All fields
missing from labels are dropped"""
schema.clear_fields()

for name in labels:
default_field = Field.from_descriptor({"name": name, "type": "any"})
field = fields_mapping.get(name, default_field)
schema.add_field(field)

def add_missing_required_labels_to_schema_fields(
    self,
    fields_mapping: Dict[str, Field],
    schema: Schema,
    labels: List[str],
    case_sensitive: bool,
):
    """Append to the schema the required fields that have no label.

    A mapped field is added when its key is not among the labels and
    `self.field_is_required` reports it as required.

    Args:
        fields_mapping: schema fields indexed by the name used for matching.
        schema: schema receiving the missing fields; it is modified in place.
        labels: labels to compare the mapping keys against.
        case_sensitive: forwarded to `self.field_is_required`.
    """
    # Membership is tested once per mapped field, so build the lookup once.
    known_labels = set(labels)

    for name, field in fields_mapping.items():
        # Cheap membership test first: fields that do have a label never
        # need the "required" check.
        if name in known_labels:
            continue
        if self.field_is_required(field, schema, case_sensitive):
            schema.add_field(field)

@staticmethod
def field_is_required(
field: Field,
schema: Schema,
case_sensitive: bool,
) -> bool:
if case_sensitive:
return field.required or field.name in schema.primary_key
else:
lower_primary_key = [pk.lower() for pk in schema.primary_key]
return field.required or field.name.lower() in lower_primary_key
4 changes: 3 additions & 1 deletion frictionless/resource/__spec__/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,10 +509,12 @@ def test_resource_validate_detector_sync_schema():
)
report = resource.validate()
assert report.valid
# schema_sync no longer mutates the user-provided schema: the order
# given by the user is preserved.
assert resource.schema.to_descriptor() == {
"fields": [
{"name": "name", "type": "string"},
{"name": "id", "type": "integer"},
{"name": "name", "type": "string"},
],
}

Expand Down
65 changes: 5 additions & 60 deletions frictionless/resources/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ def __open_header(self):
fields=self.schema.fields,
row_numbers=self.dialect.header_rows,
ignore_case=not self.dialect.header_case,
schema_sync=self.detector.schema_sync,
)

# Handle errors
Expand Down Expand Up @@ -270,24 +271,9 @@ def __open_lookup(self):
self.__lookup[source_name][source_key].add(cells)

def __open_row_stream(self):
# TODO: we need to rework this field_info / row code
# During row streaming we create a field info structure
# This structure is optimized and detached version of schema.fields
# We create all data structures in-advance to share them between rows

# Create field info
field_number = 0
field_info: Dict[str, Any] = {"names": [], "objects": [], "mapping": {}}
for field in self.schema.fields:
field_number += 1
field_info["names"].append(field.name)
field_info["objects"].append(field.to_copy())
field_info["mapping"][field.name] = (
field,
field_number,
field.create_cell_reader(),
field.create_cell_writer(),
)
# The header knows the fields to expect in the data (in order, and
# accounting for schema_sync rules).
expected_fields: List[Field] = self.header.get_expected_fields()

# Create state
memory_unique: Dict[str, Any] = {}
Expand Down Expand Up @@ -320,7 +306,7 @@ def row_stream():

row = Row(
cells,
field_info=field_info,
fields=expected_fields,
row_number=row_number,
)

Expand Down Expand Up @@ -400,50 +386,9 @@ def row_stream():
# Yield row
yield row

if self.detector.schema_sync:
# Missing required labels are not included in the
# field_info parameter used for row creation
for field in self.schema.fields:
self.remove_missing_required_label_from_field_info(field, field_info)

# Create row stream
self.__row_stream = row_stream()

def remove_missing_required_label_from_field_info(
    self, field: Field, field_info: Dict[str, Any]
):
    """Drop a field from `field_info` when its label is missing.

    The decision is delegated to `self.label_is_missing`, called with the
    field name, the names stored in `field_info`, the resource labels and
    the dialect's `header_case` setting. Nothing happens when it is false.
    """
    case_sensitive = self.dialect.header_case
    known_names = field_info["names"]

    if not self.label_is_missing(
        field.name, known_names, self.labels, case_sensitive
    ):
        return

    self.remove_field_from_field_info(field.name, field_info)

@staticmethod
def label_is_missing(
field_name: str,
expected_field_names: List[str],
table_labels: types.ILabels,
case_sensitive: bool,
) -> bool:
"""Check if a schema field name is missing from the TableResource
labels.
"""
if not case_sensitive:
field_name = field_name.lower()
table_labels = [label.lower() for label in table_labels]
expected_field_names = [
field_name.lower() for field_name in expected_field_names
]

return field_name not in table_labels and field_name in expected_field_names

@staticmethod
def remove_field_from_field_info(field_name: str, field_info: Dict[str, Any]):
field_index = field_info["names"].index(field_name)
del field_info["names"][field_index]
del field_info["objects"][field_index]
del field_info["mapping"][field_name]

def primary_key_cells(self, row: Row, case_sensitive: bool) -> Tuple[Any, ...]:
"""Create a tuple containg all cells from a given row associated to primary
keys"""
Expand Down
65 changes: 65 additions & 0 deletions frictionless/table/__spec__/test_header.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import frictionless
from frictionless import Schema, fields
from frictionless.resources import TableResource
from frictionless.table.header import Header

# General

Expand Down Expand Up @@ -42,6 +43,70 @@ def test_missing_label():
assert header.valid is False


# get_expected_fields


def _make_header(labels, field_names, *, schema_sync=False, ignore_case=False):
    """Build a Header over `labels` with one any-typed field per field name."""
    schema_fields = []
    for name in field_names:
        schema_fields.append(fields.AnyField(name=name))

    return Header(
        labels,
        fields=schema_fields,
        row_numbers=[1],
        ignore_case=ignore_case,
        schema_sync=schema_sync,
    )


@pytest.mark.parametrize(
    "labels, field_names, schema_sync, ignore_case, expected_names",
    [
        pytest.param(
            ["a", "b"],
            ["a", "b"],
            False,
            False,
            ["a", "b"],
            id="no-sync: schema fields are returned as-is",
        ),
        pytest.param(
            ["b", "a"],
            ["a", "b"],
            False,
            False,
            ["a", "b"],
            id="no-sync: schema order is kept even if labels differ",
        ),
        pytest.param(
            ["b", "a"],
            ["a", "b"],
            True,
            False,
            ["b", "a"],
            id="sync: fields are reordered to match labels",
        ),
        pytest.param(
            ["a", "extra"],
            ["a"],
            True,
            False,
            ["a", "extra"],
            id="sync: extra labels get a default any-typed field",
        ),
        pytest.param(
            ["a"],
            ["a", "b"],
            True,
            False,
            ["a"],
            id="sync: fields absent from labels are dropped",
        ),
        pytest.param(
            ["B", "A"],
            ["a", "b"],
            True,
            True,
            ["b", "a"],
            id="sync + ignore_case: matching is case-insensitive",
        ),
    ],
)
def test_get_expected_fields(
    labels, field_names, schema_sync, ignore_case, expected_names
):
    """Check the names and order of the fields a header expects in the data."""
    header = _make_header(
        labels,
        field_names,
        schema_sync=schema_sync,
        ignore_case=ignore_case,
    )
    expected_fields = header.get_expected_fields()
    assert [field.name for field in expected_fields] == expected_names


def test_get_expected_fields_sync_default_field_is_any_typed():
    """With schema_sync, a label without a schema field gets the "any" type."""
    header = _make_header(["a", "extra"], ["a"], schema_sync=True)
    _, extra_field = header.get_expected_fields()[:2]
    assert extra_field.type == "any"


def test_get_expected_fields_sync_raises_on_duplicate_labels():
    """With schema_sync, duplicated labels make get_expected_fields raise."""
    # The header is built outside the context manager on purpose: only the
    # call to get_expected_fields is expected to raise.
    duplicated = _make_header(["a", "a"], ["a"], schema_sync=True)
    with pytest.raises(frictionless.FrictionlessException):
        duplicated.get_expected_fields()


@pytest.mark.parametrize(
"source, required, valid_report, nb_errors, types_errors_expected, header_case",
[
Expand Down
17 changes: 17 additions & 0 deletions frictionless/table/__spec__/test_row.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import json
from decimal import Decimal

from frictionless import fields
from frictionless.resources import TableResource
from frictionless.table.row import Row

# General

Expand All @@ -19,6 +21,21 @@ def test_basic():
assert row.to_dict() == {"field1": 1, "field2": 2, "field3": 3}


def test_row_can_be_built_from_fields_list():
    """A Row can be created directly from cells and a plain list of fields."""
    row_fields = [fields.IntegerField(name=name) for name in ("a", "b")]
    row = Row(["1", "2"], fields=row_fields, row_number=2)

    # Cells are read through the integer fields, hence the parsed values
    assert row == {"a": 1, "b": 2}
    assert row.field_names == ["a", "b"]
    assert row.field_numbers == [1, 2]
    assert row.row_number == 2
    assert row.errors == []
    assert row.to_list() == [1, 2]
    assert row.to_dict() == {"a": 1, "b": 2}


# Convert


Expand Down
Loading
Loading