From 7b332d92c1a05c48de3e665dd7a779b71aad0fd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=A3=20Bida=20Vacaro?= Date: Fri, 22 May 2026 12:31:33 -0300 Subject: [PATCH] docs: include docstrings with rst format to autogenerate in RTD --- docs/source/conf.py | 1 + pysus/api/_impl/databases.py | 272 ++++++++++++++++++++++++- pysus/api/client.py | 99 ++++++++- pysus/api/dadosgov/client.py | 149 ++++++++++++-- pysus/api/dadosgov/databases.py | 261 ++++++++++++++++++++++-- pysus/api/dadosgov/models.py | 135 +++++++++++-- pysus/api/ducklake/catalog.py | 92 ++++++++- pysus/api/ducklake/client.py | 188 ++++++++++++++++-- pysus/api/ducklake/models.py | 150 ++++++++++++-- pysus/api/extensions.py | 68 ++++++- pysus/api/ftp/client.py | 74 ++++++- pysus/api/ftp/databases.py | 342 ++++++++++++++++++++++++++++---- pysus/api/ftp/models.py | 213 +++++++++++++++++--- pysus/api/models.py | 43 +++- pysus/api/utils.py | 13 ++ 15 files changed, 1913 insertions(+), 187 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 7c1af8b..9b2510c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -12,6 +12,7 @@ "sphinx.ext.mathjax", "sphinx.ext.viewcode", "sphinx.ext.intersphinx", + "sphinx.ext.napoleon", "nbsphinx", ] diff --git a/pysus/api/_impl/databases.py b/pysus/api/_impl/databases.py index 684edce..be44cd3 100644 --- a/pysus/api/_impl/databases.py +++ b/pysus/api/_impl/databases.py @@ -38,9 +38,45 @@ def _fetch_data( show_progress: bool = True, **kwargs, ) -> pd.DataFrame: - """Query, download, and concatenate Parquet files for a given dataset.""" + """Query, download, and concatenate Parquet files for a given dataset. + + Internally creates an async event loop, queries the PySUS API for matching + files, downloads them, and reads them into a single DataFrame. + + Parameters + ---------- + dataset : str + Name of the dataset (e.g. ``"sinan"``, ``"sinasc"``). + group : str, optional + Group or disease code to filter by. 
+ state : str, optional + Two-letter state abbreviation (e.g. ``"RJ"``). + year : int | list[int], optional + Year or list of years to fetch. + month : int | list[int], optional + Month or list of months to fetch. + show_progress : bool, optional + Whether to display a tqdm progress bar during download. Default is + ``True``. + **kwargs + Additional arguments forwarded to :meth:`PySUS.read_parquet`. + + Returns + ------- + pd.DataFrame + Concatenated data from all matching Parquet files. Returns an empty + DataFrame when no files are found. + + Raises + ------ + RuntimeError + If an event loop is already running but ``nest_asyncio`` is not + installed. + """ async def _fetch(): + """Coroutine that performs the actual API query, download, and read.""" + async with PySUS() as pysus: years = [year] if isinstance(year, int) else (year or [None]) months = [month] if isinstance(month, int) else (month or [None]) @@ -157,7 +193,25 @@ def sinan( year: int | list[int], **kwargs, ) -> pd.DataFrame: - """Fetch SINAN records for a given disease and year(s).""" + """Fetch SINAN records for a given disease and year(s). + + SINAN (Sistema de Informação de Agravos de Notificação) is the Brazilian + notifiable-disease information system. + + Parameters + ---------- + disease : Literal + Disease code (e.g. ``"DENG"`` for dengue, ``"ZIKA"`` for zika). + year : int | list[int] + Year or list of years to fetch. + **kwargs + Additional arguments forwarded to :func:`_fetch_data`. + + Returns + ------- + pd.DataFrame + SINAN records for the specified disease and year(s). + """ return _fetch_data( dataset="sinan", group=disease.upper(), @@ -171,7 +225,27 @@ def sinasc( group: str | None = None, **kwargs, ) -> pd.DataFrame: - """Fetch SINASC birth certificates for a given state, year(s), and group.""" + """Fetch SINASC birth certificates for a given state, year(s), and group. + + SINASC (Sistema de Informação sobre Nascidos Vivos) is the Brazilian live + birth information system. 
+ + Parameters + ---------- + state : State + Two-letter state abbreviation (e.g. ``"RJ"``). + year : int | list[int] + Year or list of years to fetch. + group : str, optional + Additional grouping code. + **kwargs + Additional arguments forwarded to :func:`_fetch_data`. + + Returns + ------- + pd.DataFrame + SINASC birth records for the specified state, year(s), and group. + """ return _fetch_data( dataset="sinasc", state=state.upper(), @@ -186,7 +260,27 @@ def sim( group: str | None = None, **kwargs, ) -> pd.DataFrame: - """Fetch SIM mortality records for a given state, year(s), and group.""" + """Fetch SIM mortality records for a given state, year(s), and group. + + SIM (Sistema de Informação sobre Mortalidade) is the Brazilian mortality + information system. + + Parameters + ---------- + state : State + Two-letter state abbreviation (e.g. ``"RJ"``). + year : int | list[int] + Year or list of years to fetch. + group : str, optional + Additional grouping code. + **kwargs + Additional arguments forwarded to :func:`_fetch_data`. + + Returns + ------- + pd.DataFrame + SIM mortality records for the specified state, year(s), and group. + """ return _fetch_data( dataset="sim", state=state.upper(), @@ -202,7 +296,29 @@ def sih( group: str | None = None, **kwargs, ) -> pd.DataFrame: - """Fetch SIH hospital admissions for a state, year, month, and group.""" + """Fetch SIH hospital admissions for a state, year, month, and group. + + SIH (Sistema de Informação Hospitalar) is the Brazilian hospital + admission information system. + + Parameters + ---------- + state : State + Two-letter state abbreviation (e.g. ``"RJ"``). + year : int | list[int] + Year or list of years to fetch. + month : int | list[int] + Month or list of months to fetch. + group : str, optional + Additional grouping code. + **kwargs + Additional arguments forwarded to :func:`_fetch_data`. + + Returns + ------- + pd.DataFrame + SIH hospital admission records. 
+ """ return _fetch_data( dataset="sih", state=state.upper(), @@ -219,7 +335,29 @@ def sia( group: str | None = None, **kwargs, ) -> pd.DataFrame: - """Fetch SIA ambulatory care for a state, year, month, and group.""" + """Fetch SIA ambulatory care for a state, year, month, and group. + + SIA (Sistema de Informação Ambulatorial) is the Brazilian ambulatory care + information system. + + Parameters + ---------- + state : State + Two-letter state abbreviation (e.g. ``"RJ"``). + year : int | list[int] + Year or list of years to fetch. + month : int | list[int] + Month or list of months to fetch. + group : str, optional + Additional grouping code. + **kwargs + Additional arguments forwarded to :func:`_fetch_data`. + + Returns + ------- + pd.DataFrame + SIA ambulatory care records. + """ return _fetch_data( dataset="sia", state=state.upper(), @@ -235,7 +373,27 @@ def pni( group: str | None = None, **kwargs, ) -> pd.DataFrame: - """Fetch PNI immunisation records for a given state, year(s), and group.""" + """Fetch PNI immunisation records for a given state, year(s), and group. + + PNI (Programa Nacional de Imunizações) is the Brazilian national + immunisation programme. + + Parameters + ---------- + state : State + Two-letter state abbreviation (e.g. ``"RJ"``). + year : int | list[int] + Year or list of years to fetch. + group : str, optional + Additional grouping code. + **kwargs + Additional arguments forwarded to :func:`_fetch_data`. + + Returns + ------- + pd.DataFrame + PNI immunisation records. + """ return _fetch_data( dataset="pni", state=state.upper(), @@ -249,7 +407,25 @@ def ibge( group: str | None = None, **kwargs, ) -> pd.DataFrame: - """Fetch IBGE census data for given year(s) and optional group.""" + """Fetch IBGE census data for given year(s) and optional group. + + IBGE (Instituto Brasileiro de Geografia e Estatística) provides census + and demographic data. + + Parameters + ---------- + year : int | list[int] + Year or list of years to fetch. 
+ group : str, optional + Additional grouping code. + **kwargs + Currently ignored; not forwarded to :func:`_fetch_data`. + + Returns + ------- + pd.DataFrame + IBGE census data for the specified year(s) and group. + """ return _fetch_data(dataset="ibge", group=group, year=year) @@ -260,7 +436,29 @@ def cnes( group: str | None = None, **kwargs, ) -> pd.DataFrame: - """Fetch CNES health facilities for a state, year, month, and group.""" + """Fetch CNES health facilities for a state, year, month, and group. + + CNES (Cadastro Nacional de Estabelecimentos de Saúde) is the Brazilian + registry of health-care facilities. + + Parameters + ---------- + state : State + Two-letter state abbreviation (e.g. ``"RJ"``). + year : int | list[int] + Year or list of years to fetch. + month : int | list[int] + Month or list of months to fetch. + group : str, optional + Additional grouping code. + **kwargs + Additional arguments forwarded to :func:`_fetch_data`. + + Returns + ------- + pd.DataFrame + CNES health-facility records. + """ return _fetch_data( dataset="cnes", state=state.upper(), @@ -277,7 +475,29 @@ def ciha( group: str | None = "CIHA", **kwargs, ) -> pd.DataFrame: - """Fetch CIHA hospitalisation records for state, year, month, and group.""" + """Fetch CIHA hospitalisation records for state, year, month, and group. + + CIHA (Comunicação de Internação Hospitalar) provides hospitalisation + records. + + Parameters + ---------- + state : State + Two-letter state abbreviation (e.g. ``"RJ"``). + year : int | list[int] + Year or list of years to fetch. + month : int | list[int] + Month or list of months to fetch. + group : str, optional + Additional grouping code. Default is ``"CIHA"``. + **kwargs + Additional arguments forwarded to :func:`_fetch_data`. + + Returns + ------- + pd.DataFrame + CIHA hospitalisation records. 
+ """ return _fetch_data( dataset="ciha", state=state.upper(), @@ -306,9 +526,39 @@ def list_files( month: int | list[int] | None = None, **kwargs, ) -> pd.DataFrame: - """List catalog files filtered by client, group, state, year, and month.""" + """List catalog files filtered by client, group, state, year, and month. + + Queries the PySUS API metadata and returns a DataFrame with file name, + path, dataset, group, year, month, state, and last-modified timestamp for + every matching file without downloading the actual data. + + Parameters + ---------- + dataset : Literal + Dataset name (e.g. ``"SINAN"``, ``"SINASC"``, etc.). + client : Literal["FTP", "DadosGov"], optional + Data source client to query. + group : str, optional + Group or disease code to filter by. + state : str, optional + Two-letter state abbreviation (e.g. ``"RJ"``). + year : int | list[int], optional + Year or list of years to filter by. + month : int | list[int], optional + Month or list of months to filter by. + **kwargs + Additional arguments forwarded to :meth:`PySUS.query`. + + Returns + ------- + pd.DataFrame + DataFrame with columns ``name``, ``path``, ``dataset``, ``group``, + ``year``, ``month``, ``state``, and ``modify``. + """ async def _list(): + """Coroutine that queries the PySUS API and builds the file list.""" + async with PySUS() as pysus: years = [year] if isinstance(year, int) else (year or [None]) months = [month] if isinstance(month, int) else (month or [None]) diff --git a/pysus/api/client.py b/pysus/api/client.py index 1373755..89f8ff1 100644 --- a/pysus/api/client.py +++ b/pysus/api/client.py @@ -69,7 +69,17 @@ class PySUS: """Central orchestrator for downloading and querying PySUS datasets.""" def __init__(self, db_path: Path = CACHEPATH / "config.db"): - """Initialize PySUS with a DuckDB-backed SQLAlchemy engine.""" + """Initialize the PySUS orchestrator. + + Creates a SQLAlchemy engine backed by DuckDB, initializes the + schema, and sets up the session factory. 
+ + Parameters + ---------- + db_path : Path, optional + Path to the DuckDB database file. Defaults to + ``CACHEPATH / "config.db"``. + """ db_path = Path(db_path) db_path.parent.mkdir(parents=True, exist_ok=True) @@ -241,12 +251,31 @@ async def download( ) -> BaseLocalFile: """Download a remote file and return a local file handle. + Skips re-download if a matching local copy already exists. + Parameters ---------- - timeout : float | None - Maximum seconds to wait for the download. ``None`` (default) means - no timeout – use this when the socket-level timeout on the - underlying client is sufficient. + file : BaseRemoteFile + The remote file to download. + token : str, optional + Access token for authenticated clients (e.g. DadosGov). + callback : Callable, optional + Progress callback invoked during the download. + timeout : float, optional + Maximum seconds to wait for the download. ``None`` (default) + means no timeout. + + Returns + ------- + BaseLocalFile + The downloaded file wrapped in the appropriate handler. + + Raises + ------ + ValueError + If the file's client is not recognised. + RuntimeError + If the download fails for any reason. """ from pysus.api.extensions import ExtensionFactory @@ -332,7 +361,32 @@ async def download_to_parquet( timeout: float | None = None, add_dv: bool = True, ) -> Parquet: - """Download a file and convert it to Parquet format.""" + """Download a file and convert it to Parquet format. + + Parameters + ---------- + file : BaseRemoteFile + The remote file to download and convert. + token : str, optional + Access token for authenticated clients. + callback : Callable[[int, int], None], optional + Progress callback. + timeout : float, optional + Maximum seconds to wait for the download. + add_dv : bool, optional + Whether to apply the IBGE verification digit on load + (default True). + + Returns + ------- + Parquet + The converted Parquet file handler. 
+ + Raises + ------ + NotImplementedError + If the downloaded file type cannot be converted to Parquet. + """ local_file = await self.download( file=file, @@ -368,8 +422,13 @@ async def download_to_parquet( ) def get_local_hierarchy(self): - """ - Build a nested dict of cached files grouped by client and dataset. + """Build a nested dict of cached files grouped by client and dataset. + + Returns + ------- + dict + Nested dict keyed by + ``{client: {dataset: {group: [files]}}}``. """ with self.Session() as session: @@ -446,10 +505,27 @@ def read_parquet( Parameters ---------- - add_dv : bool + paths : list of Path + One or more Parquet file paths to read. + sql : str, optional + Optional SQL filter expression applied to the result. + mode : {"union", "intersection", "strict"}, optional + Schema resolution mode (default ``"union"``). + add_dv : bool, optional When True, automatically applies the IBGE verification digit to - municipality code columns. If there are matching columns, a - DataFrame is returned instead of a DuckDBPyConnection. + municipality code columns. If matching columns are found, a + DataFrame is returned instead of a ``DuckDBPyConnection``. + + Returns + ------- + DuckDBPyConnection or pd.DataFrame + The query result. + + Raises + ------ + ValueError + If no paths are provided, or if the schema mode is ``"strict"`` + and the files have differing schemas. 
""" from pysus.api.utils import add_dv as _add_dv_fn @@ -459,6 +535,7 @@ def read_parquet( raise ValueError("No paths provided") def get_columns(path: Path) -> set[tuple[str, str]]: + """Return the schema of a Parquet file as (name, type) pairs.""" result = duckdb.execute(f"SELECT * FROM '{path}' LIMIT 0") return {(col[0], str(col[1])) for col in result.description} diff --git a/pysus/api/dadosgov/client.py b/pysus/api/dadosgov/client.py index d56b57b..5487f80 100644 --- a/pysus/api/dadosgov/client.py +++ b/pysus/api/dadosgov/client.py @@ -17,7 +17,19 @@ def to_datetime(value: Any) -> datetime | None: - """Parse a Brazilian date string into a datetime object.""" + """Parse a Brazilian date string into a datetime object. + + Parameters + ---------- + value : Any + The value to parse, expected to be a date string in Brazilian format + (e.g., ``%d/%m/%Y %H:%M:%S`` or ``%d/%m/%Y``). + + Returns + ------- + datetime or None + Parsed datetime object, or None if the value cannot be parsed. + """ if not value or not isinstance(value, str) or "Indisponível" in value: return None for fmt in ("%d/%m/%Y %H:%M:%S", "%d/%m/%Y"): @@ -29,7 +41,18 @@ def to_datetime(value: Any) -> datetime | None: def to_bool(value: Any) -> bool: - """Parse a Brazilian Portuguese boolean value ("sim"/"não") into a bool.""" + """Parse a Brazilian Portuguese boolean value into a bool. + + Parameters + ---------- + value : Any + The value to parse (e.g., ``"sim"``, ``"não"``, ``True``, ``False``). + + Returns + ------- + bool + True if the value represents an affirmative, False otherwise. + """ if isinstance(value, bool): return value return str(value).lower() in ("sim", "true", "1") @@ -48,26 +71,62 @@ class DadosGov(BaseRemoteClient): _client: httpx.AsyncClient | None = PrivateAttr(default=None) def __init__(self, **data): - """Initialize the DadosGov client.""" + """Initialize the DadosGov client. 
+ + Parameters + ---------- + ``**data`` + Additional keyword arguments forwarded to the parent constructor. + """ super().__init__(**data) @property def name(self) -> str: - """Return the short client name.""" + """Return the short client name. + + Returns + ------- + str + The abbreviated client name ``"DadosGov"``. + """ return "DadosGov" @property def long_name(self) -> str: - """Return the human-readable client name.""" + """Return the human-readable client name. + + Returns + ------- + str + The full Portuguese name of the portal. + """ return "Portal Brasileiro de Dados Abertos" @property def description(self) -> str: - """Return a description of the client.""" + """Return a description of the client. + + Returns + ------- + str + A Portuguese description of the API interface. + """ return "Interface de acesso ao API do Portal de Dados Abertos" async def connect(self, token: str | None = None) -> None: - """Connect to the dados.gov.br API with the given token.""" + """Connect to the dados.gov.br API with the given token. + + Parameters + ---------- + token : str, optional + The API authentication token. If not provided, uses the + previously stored token. + + Raises + ------ + ValueError + If no token is provided and none was previously stored. + """ _token = token or self._token if not _token: @@ -95,23 +154,62 @@ async def connect(self, token: str | None = None) -> None: ) async def login(self, token: str | None = None, **kwargs) -> None: - """Authenticate with the API (delegates to connect).""" + """Authenticate with the API. + + Delegates to the :meth:`connect` method. + + Parameters + ---------- + token : str, optional + The API authentication token. + ``**kwargs`` + Additional keyword arguments (currently unused). 
+ """ await self.connect(token=token) async def close(self) -> None: - """Close the underlying HTTP client.""" + """Close the underlying HTTP client and release resources.""" if self._client: await self._client.aclose() self._client = None async def datasets(self, **kwargs) -> list[Dataset]: - """Return a list of pre-configured health datasets.""" + """Return a list of pre-configured health datasets. + + Returns + ------- + list[:class:`~pysus.api.dadosgov.models.Dataset`] + A list of available :class:`~pysus.api.dadosgov.models.Dataset` + instances for known health databases. + """ from .databases import AVAILABLE_DATABASES return [db_class(client=self) for db_class in AVAILABLE_DATABASES] async def list_datasets(self, **kwargs) -> list[ConjuntoDados]: - """Search and list available datasets from the portal.""" + """Search and list available datasets from the portal. + + Parameters + ---------- + ``**kwargs`` + Search parameters. Supported keys: + + - ``pagina`` (int): Page number for pagination. + - ``nome_conjunto`` (str): Filter by dataset name. + - ``dados_abertos`` (bool): Filter by open data flag. + - ``is_privado`` (bool): Filter by private datasets. + - ``id_organizacao`` (str): Filter by organisation ID. + + Returns + ------- + list[ConjuntoDados] + A list of datasets matching the search criteria. + + Raises + ------ + ConnectionError + If the client is not connected. + """ if self._client is None: raise ConnectionError( "Client not connected. Call login(token=...) first.", @@ -136,7 +234,23 @@ async def list_datasets(self, **kwargs) -> list[ConjuntoDados]: return [ConjuntoDados(**item, client=self) for item in data] async def get_dataset(self, id: str) -> ConjuntoDados: - """Fetch a single dataset by its ID.""" + """Fetch a single dataset by its ID. + + Parameters + ---------- + id : str + The unique identifier of the dataset. + + Returns + ------- + ConjuntoDados + The requested dataset. 
+ + Raises + ------ + ConnectionError + If the client is not connected. + """ if self._client is None: raise ConnectionError( "Client not connected. Call login(token=...) first.", @@ -194,7 +308,16 @@ class Recurso(BaseModel): file_name: str | None = Field(None, alias="nomeArquivo") async def get_size(self) -> int: - """Retrieve the file size from the remote server.""" + """Retrieve the file size from the remote server. + + Makes a HEAD request (falling back to GET with a Range header) + to determine the Content-Length of the resource. + + Returns + ------- + int + The file size in bytes, or 0 if the size could not be determined. + """ async with httpx.AsyncClient(follow_redirects=True) as client: response = await client.head(self.url) diff --git a/pysus/api/dadosgov/databases.py b/pysus/api/dadosgov/databases.py index 192587a..0729709 100644 --- a/pysus/api/dadosgov/databases.py +++ b/pysus/api/dadosgov/databases.py @@ -24,6 +24,7 @@ def _parse_year(val: str) -> int | None: + """Parse a year string into an integer within the valid range.""" try: y = int(val) return y if 1970 <= y <= 2100 else None @@ -32,6 +33,7 @@ def _parse_year(val: str) -> int | None: def _skip(name: str) -> bool: + """Check whether a filename should be skipped by naming conventions.""" return name.startswith("get_") or name.lower().endswith(".pdf") @@ -45,16 +47,35 @@ class CNES(Dataset): @property def name(self) -> str: - """Return the short name.""" + """Return the short name. + + Returns + ------- + str + The abbreviated dataset name ``"CNES"``. + """ return "CNES" @property def long_name(self) -> str: - """Return the human-readable name.""" + """Return the human-readable name. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ return "Cadastro Nacional de Estabelecimentos de Saúde" @property def description(self) -> str: + """Return a description of the dataset. + + Returns + ------- + str + A Portuguese description of the CNES information system. 
+ """ return ( "O Cadastro Nacional de Estabelecimentos de Saúde (CNES) é o " "sistema de informação oficial de cadastramento de informações " @@ -62,7 +83,19 @@ def description(self) -> str: ) def formatter(self, filename: str) -> dict[str, Any]: - """Parse a CNES filename and extract metadata.""" + """Parse a CNES filename and extract metadata. + + Parameters + ---------- + filename : str + The name of the file to parse. + + Returns + ------- + dict[str, Any] + A dictionary with keys ``state``, ``year``, and ``month``. + Unrecognised files return ``None`` for all keys. + """ try: name = filename.strip() if _skip(name): @@ -108,20 +141,51 @@ class PNI(Dataset): @property def name(self) -> str: - """Return the short name.""" + """Return the short name. + + Returns + ------- + str + The abbreviated dataset name ``"PNI"``. + """ return "PNI" @property def long_name(self) -> str: - """Return the human-readable name.""" + """Return the human-readable name. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ return "Programa Nacional de Imunizações" @property def description(self) -> str: + """Return a description of the dataset. + + Returns + ------- + str + A Portuguese description of the PNI vaccination monitoring system. + """ return "O PNI monitora a cobertura vacinal e doses aplicadas no Brasil." def formatter(self, filename: str) -> dict[str, Any]: - """Parse a PNI vaccination filename into month and year.""" + """Parse a PNI vaccination filename into month and year. + + Parameters + ---------- + filename : str + The name of the file to parse. + + Returns + ------- + dict[str, Any] + A dictionary with keys ``state``, ``year``, and ``month``. + Unrecognised files return ``None`` for all keys. + """ try: name = filename.strip().lower() if _skip(name): @@ -147,22 +211,53 @@ class SIA(Dataset): @property def name(self) -> str: - """Return the short name.""" + """Return the short name. 
+ + Returns + ------- + str + The abbreviated dataset name ``"SIA"``. + """ return "SIA" @property def long_name(self) -> str: - """Return the human-readable name.""" + """Return the human-readable name. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ return "Sistema de Informações Ambulatoriais" @property def description(self) -> str: + """Return a description of the dataset. + + Returns + ------- + str + A Portuguese description of the SIA outpatient information system. + """ return """ O SIA acompanha as ações de saúde produzidas no âmbito ambulatorial. """ def formatter(self, filename: str) -> dict[str, Any]: - """Parse an SIA filename into year.""" + """Parse an SIA filename into year. + + Parameters + ---------- + filename : str + The name of the file to parse. + + Returns + ------- + dict[str, Any] + A dictionary with keys ``state``, ``year``, and ``month``. + Unrecognised files return ``None`` for all keys. + """ try: name = filename.strip().lower() if _skip(name): @@ -214,23 +309,54 @@ class SINAN(Dataset): @property def name(self) -> str: - """Return the short name.""" + """Return the short name. + + Returns + ------- + str + The abbreviated dataset name ``"SINAN"``. + """ return "SINAN" @property def long_name(self) -> str: - """Return the human-readable name.""" + """Return the human-readable name. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ return "Sistema de Informação de Agravos de Notificação" @property def description(self) -> str: + """Return a description of the dataset. + + Returns + ------- + str + A Portuguese description of the SINAN notifiable diseases system. + """ return """ O SINAN é alimentado pela notificação de doenças de notificação compulsória """ def formatter(self, filename: str) -> dict[str, Any]: - """Parse a SINAN filename into state and year.""" + """Parse a SINAN filename into state and year. 
+ + Parameters + ---------- + filename : str + The name of the file to parse. + + Returns + ------- + dict[str, Any] + A dictionary with keys ``state``, ``year``, and ``month``. + Unrecognised files return ``None`` for all keys. + """ try: name = filename.strip().upper() if _skip(name): @@ -270,22 +396,53 @@ class SIM(Dataset): @property def name(self) -> str: - """Return the short name.""" + """Return the short name. + + Returns + ------- + str + The abbreviated dataset name ``"SIM"``. + """ return "SIM" @property def long_name(self) -> str: - """Return the human-readable name.""" + """Return the human-readable name. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ return "Sistema de Informação sobre Mortalidade" @property def description(self) -> str: + """Return a description of the dataset. + + Returns + ------- + str + A Portuguese description of the SIM mortality information system. + """ return """ O SIM coleta dados sobre óbitos no país para análise epidemiológica. """ def formatter(self, filename: str) -> dict[str, Any]: - """Parse a SIM filename into year.""" + """Parse a SIM filename into year. + + Parameters + ---------- + filename : str + The name of the file to parse. + + Returns + ------- + dict[str, Any] + A dictionary with keys ``state``, ``year``, and ``month``. + Unrecognised files return ``None`` for all keys. + """ try: name = filename.strip() if _skip(name): @@ -325,23 +482,54 @@ class SINASC(Dataset): @property def name(self) -> str: - """Return the short name.""" + """Return the short name. + + Returns + ------- + str + The abbreviated dataset name ``"SINASC"``. + """ return "SINASC" @property def long_name(self) -> str: - """Return the human-readable name.""" + """Return the human-readable name. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ return "Sistema de Informações sobre Nascidos Vivos" @property def description(self) -> str: + """Return a description of the dataset. 
+ + Returns + ------- + str + Portuguese description of the SINASC live birth system. + """ return """ O SINASC fornece subsídios para o diagnóstico de saúde e planejamento de políticas de natalidade. """ def formatter(self, filename: str) -> dict[str, Any]: - """Parse a SINASC filename into year.""" + """Parse a SINASC filename into year. + + Parameters + ---------- + filename : str + The name of the file to parse. + + Returns + ------- + dict[str, Any] + A dictionary with keys ``state``, ``year``, and ``month``. + Unrecognised files return ``None`` for all keys. + """ try: name = filename.strip() if _skip(name): @@ -377,20 +565,51 @@ class COVID19(Dataset): @property def name(self) -> str: - """Return the short name.""" + """Return the short name. + + Returns + ------- + str + The abbreviated dataset name ``"COVID19"``. + """ return "COVID19" @property def long_name(self) -> str: - """Return the human-readable name.""" + """Return the human-readable name. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ return "Casos Confirmados de COVID-19" @property def description(self) -> str: + """Return a description of the dataset. + + Returns + ------- + str + A Portuguese description of the COVID-19 confirmed cases dataset. + """ return "Dados anonimizados de casos confirmados de COVID-19." def formatter(self, filename: str) -> dict[str, Any]: - """Parse a COVID-19 filename.""" + """Parse a COVID-19 filename and extract metadata. + + Parameters + ---------- + filename : str + The name of the file to parse. + + Returns + ------- + dict[str, Any] + A dictionary with keys ``state``, ``year``, and ``month``. + Unrecognised files return ``None`` for all keys. 
+ """ try: name = filename.strip().lower() if _skip(name) or name.endswith(".xlsx"): diff --git a/pysus/api/dadosgov/models.py b/pysus/api/dadosgov/models.py index a582eb7..4a962ca 100644 --- a/pysus/api/dadosgov/models.py +++ b/pysus/api/dadosgov/models.py @@ -59,16 +59,33 @@ class File(BaseRemoteFile): _metadata: dict[str, Any] = PrivateAttr(default_factory=dict) def __init__(self, **data): - """Initialize the File with optional metadata.""" + """Initialize the File with optional metadata. + + Parameters + ---------- + **data + Keyword arguments including an optional ``_metadata`` dict + that is stored on the private attribute ``_metadata``. + """ metadata = data.pop("_metadata", {}) super().__init__(**data) self._metadata = metadata def __repr__(self): + """Return the file basename as its string representation.""" return self.basename def model_post_init(self, __context: Any) -> None: - """Fetch remote metadata if size or modify date is missing.""" + """Fetch remote metadata if size or modify date is missing. + + If either ``api_size`` or ``last_modified`` is falsy, schedules a + background task to fetch metadata from the remote server. + + Parameters + ---------- + __context : Any + Pydantic validation context (unused). + """ if not self.record.api_size or not self.record.last_modified: try: loop = asyncio.get_running_loop() @@ -80,19 +97,42 @@ def model_post_init(self, __context: Any) -> None: @property def extension(self) -> str: - """Return the file extension.""" + """Return the file extension. + + Returns + ------- + str + The file extension (e.g., ``".csv"``, ``".zip"``). + """ if self.record.file_name: return pathlib.Path(self.record.file_name).suffix return pathlib.Path(self.record.url.split("/")[-1].split("?")[0]).suffix @property def size(self) -> int: - """Return the file size in bytes.""" + """Return the file size in bytes. + + Returns + ------- + int + The file size, or 0 if unknown. 
+ """ return self.record.api_size or 0 @property def modify(self) -> datetime: - """Return the last modification date.""" + """Return the last modification date. + + Returns + ------- + datetime + The last modification datetime. + + Raises + ------ + ValueError + If the modification date has not been set. + """ m = self.record.last_modified if not m: raise ValueError("File requires a modify date") @@ -100,21 +140,43 @@ def modify(self) -> datetime: @property def year(self) -> int | None: - """Return the inferred year from metadata.""" + """Return the inferred year from metadata. + + Returns + ------- + int or None + The year if present in metadata, otherwise None. + """ return self._metadata.get("year") @property def month(self) -> int | None: - """Return the inferred month from metadata.""" + """Return the inferred month from metadata. + + Returns + ------- + int or None + The month if present in metadata, otherwise None. + """ return self._metadata.get("month") @property def state(self) -> State | None: - """Return the inferred state from metadata.""" + """Return the inferred state from metadata. + + Returns + ------- + State or None + The state abbreviation if present in metadata, otherwise None. + """ return self._metadata.get("state") async def fetch_metadata(self) -> None: - """Fetch file size and last-modified from the remote server.""" + """Fetch file size and last-modified from the remote server. + + Updates ``record.api_size`` and ``record.last_modified`` in-place. + Silently ignores connection errors. + """ try: async with httpx.AsyncClient( follow_redirects=True, @@ -151,7 +213,16 @@ async def _download( return await self.client._download_file(self, output, callback=callback) async def fetch_size(self) -> int: - """Fetch the remote file size and update the local record.""" + """Fetch the remote file size and update the local record. + + Makes a HEAD request (falling back to GET with a Range header) + to determine the Content-Length. 
+ + Returns + ------- + int + The file size in bytes, or 0 if the size could not be determined. + """ try: async with httpx.AsyncClient( follow_redirects=True, @@ -188,30 +259,59 @@ def __init__( dataset: BaseRemoteDataset, formatter: Callable | None = None, ): - """Initialize the Group with a dataset record and optional formatter.""" + """Initialize the Group with a dataset record and optional formatter. + + Parameters + ---------- + record : ConjuntoDados + The API response record for this group. + dataset : BaseRemoteDataset + The parent dataset this group belongs to. + formatter : Callable, optional + A callable that extracts metadata from filenames. + """ super().__init__( record=record, dataset=dataset # type: ignore[call-arg] ) self._formatter = formatter def __repr__(self): + """Return the group name as its string representation.""" return self.name @property def name(self) -> str: - """Return the group name, resolved through dataset aliases.""" + """Return the group name, resolved through dataset aliases. + + Returns + ------- + str + The alias for the group slug if defined, otherwise the raw slug. + """ slug = self.record.slug aliases = getattr(self.dataset, "group_aliases", {}) return aliases.get(slug, slug) @property def long_name(self) -> str: - """Return the group title.""" + """Return the group title. + + Returns + ------- + str + The title of the underlying API record. + """ return self.record.title @property def description(self) -> str: - """Return an empty description.""" + """Return an empty description for the group. + + Returns + ------- + str + An empty string. + """ return "" async def _fetch_files(self) -> list[BaseRemoteFile]: @@ -247,13 +347,18 @@ async def _fetch_files(self) -> list[BaseRemoteFile]: class Dataset(BaseRemoteDataset): - """A health dataset available through dados.gov.br.""" + """A health dataset available through dados.gov.br. 
+ + Subclasses define a list of API dataset IDs and an optional + :meth:`formatter` that extracts metadata from file names. + """ ids: list[str] = [] client: "DadosGov" group_aliases: dict[str, str] = {} def __repr__(self): + """Return the dataset name as its string representation.""" return self.name @abstractmethod diff --git a/pysus/api/ducklake/catalog.py b/pysus/api/ducklake/catalog.py index cc83ba3..a9c0fd0 100644 --- a/pysus/api/ducklake/catalog.py +++ b/pysus/api/ducklake/catalog.py @@ -56,14 +56,36 @@ class CatalogTable(Base): class Origin(enum.Enum): - """Origin type for a dataset: FTP or API.""" + """Origin type for a dataset. + + Attributes + ---------- + FTP : str + Dataset sourced from the FTP server. + API : str + Dataset sourced from an API. + """ FTP = "ftp" API = "api" class CatalogDataset(CatalogTable): - """ORM model for the datasets table, representing a dataset collection.""" + """ORM model for the datasets table, representing a dataset collection. + + Parameters + ---------- + id : int, optional + Primary key (auto-generated by sequence). + name : str + Unique short name for the dataset. + long_name : str + Human-readable full name. + description : str, optional + Optional description of the dataset contents. + origin : Origin + Whether the dataset originates from FTP or an API. + """ __tablename__ = "datasets" @@ -95,7 +117,23 @@ class CatalogDataset(CatalogTable): class ColumnDefinition(CatalogTable): - """ORM model for dataset column metadata (name, type, description).""" + """ORM model for dataset column metadata. + + Parameters + ---------- + id : int, optional + Primary key (auto-generated by sequence). + dataset_id : int + Foreign key referencing the parent dataset. + name : str + Column name. + type : str + Column data type string. + description : str, optional + Optional description of the column. + nullable : bool, optional + Whether the column allows null values. 
+ """ __tablename__ = "dataset_columns" @@ -129,7 +167,21 @@ class ColumnDefinition(CatalogTable): class DatasetGroup(CatalogTable): - """ORM model for dataset groups, grouping related files within a dataset.""" + """ORM model for dataset groups, grouping related files within a dataset. + + Parameters + ---------- + id : int, optional + Primary key (auto-generated by sequence). + name : str + Short name for the group. + dataset_id : int + Foreign key referencing the parent dataset. + long_name : str + Human-readable full name. + description : str, optional + Optional description of the group contents. + """ __tablename__ = "dataset_groups" @@ -162,7 +214,37 @@ class DatasetGroup(CatalogTable): class CatalogFile(CatalogTable): - """ORM model for the files table, representing individual data files.""" + """ORM model for the files table, representing individual data files. + + Parameters + ---------- + id : int, optional + Primary key (auto-generated by sequence). + dataset_id : int + Foreign key referencing the parent dataset. + group_id : int, optional + Foreign key referencing the parent group. + path : str + Object storage path to the file. + size : int + File size in bytes. + rows : int + Number of rows in the file. + modified : datetime + Timestamp of the last known modification. + origin_modified : datetime, optional + Original modification timestamp from the source. + origin_path : str + Original source path of the file. + sha256 : str, optional + SHA-256 hex digest for integrity verification. + year : int, optional + Data year associated with the file. + month : int, optional + Data month associated with the file. + state : str, optional + Two-letter state code associated with the file. 
+ """ __tablename__ = "files" diff --git a/pysus/api/ducklake/client.py b/pysus/api/ducklake/client.py index 47ef426..21e9525 100644 --- a/pysus/api/ducklake/client.py +++ b/pysus/api/ducklake/client.py @@ -24,7 +24,15 @@ class CatalogDatasetAdapter: - """Adapter wrapping a CatalogDataset ORM record for use by File objects.""" + """Adapter wrapping a CatalogDataset ORM record for use by File objects. + + Parameters + ---------- + catalog_dataset : CatalogDataset + The ORM record to wrap. + ducklake : DuckLake + The parent DuckLake client instance. + """ def __init__(self, catalog_dataset: CatalogDataset, ducklake): self.name = catalog_dataset.name @@ -36,12 +44,26 @@ def __init__(self, catalog_dataset: CatalogDataset, ducklake): @property def content(self): - """Query the DuckLake client for files in this dataset.""" + """Query the DuckLake client for files in this dataset. + + Returns + ------- + list + List of files belonging to this dataset. + """ return self.ducklake.query(dataset=self.name.upper()) class DatasetGroupAdapter: - """Adapter wrapping a DatasetGroup ORM record for use by File objects.""" + """Adapter wrapping a DatasetGroup ORM record for use by File objects. + + Parameters + ---------- + dataset_group : DatasetGroup + The ORM record to wrap. + dataset : CatalogDataset + The parent dataset. + """ def __init__(self, dataset_group: DatasetGroup, dataset): self.name = dataset_group.name @@ -50,11 +72,24 @@ def __init__(self, dataset_group: DatasetGroup, dataset): self.dataset = dataset def __str__(self): + """Return the group name as its string representation. + + Returns + ------- + str + The short name of the group. + """ return self.name @property async def files(self): - """Return the list of files in this group.""" + """Return the list of files in this group. + + Returns + ------- + list + List of file objects in this group. 
+ """ return [] async def _fetch_files(self): @@ -62,19 +97,52 @@ async def _fetch_files(self): return [] async def search(self, **kwargs): - """Search for files within this group matching the given criteria.""" + """Search for files within this group matching the given criteria. + + Parameters + ---------- + ``**kwargs`` + Arbitrary filter criteria. + + Returns + ------- + list + List of matching file objects. + """ return [] class DuckLakeCredentials(BaseModel): - """Credentials for authenticating with the S3-compatible object storage.""" + """Credentials for authenticating with the S3-compatible object storage. + + Parameters + ---------- + access_key : SecretStr + The S3 access key ID. + secret_key : SecretStr + The S3 secret access key. + """ access_key: SecretStr secret_key: SecretStr class DuckLake(BaseRemoteClient): - """Client for the DuckLake S3-based public health dataset catalog.""" + """Client for the DuckLake S3-based public health dataset catalog. + + Parameters + ---------- + endpoint : str, optional + S3-compatible object storage endpoint. + region : str, optional + Storage region name. + bucket : str, optional + Bucket name containing the catalog. + credentials : DuckLakeCredentials, optional + Credentials for authenticated S3 operations. + engine : object, optional + Pre-configured SQLAlchemy engine to reuse. + """ endpoint: str = "nbg1.your-objectstorage.com" region: str = "nbg1" @@ -89,7 +157,15 @@ class DuckLake(BaseRemoteClient): _Session: Any = PrivateAttr(default=None) def __init__(self, engine=None, **data): - """Initialize the DuckLake client with an optional existing engine.""" + """Initialize the DuckLake client with an optional existing engine. + + Parameters + ---------- + engine : object, optional + Pre-configured SQLAlchemy engine instead of creating a new one. + ``**data`` + Additional fields passed to the Pydantic base model. 
+ """ super().__init__(**data) self._engine = engine self._cache_dir = Path(CACHEPATH) / "ducklake" @@ -98,22 +174,46 @@ def __init__(self, engine=None, **data): @property def name(self) -> str: - """Return the short name of this client.""" + """Return the short name of this client. + + Returns + ------- + str + The client short name. + """ return "DuckLake" @property def long_name(self) -> str: - """Return the human-readable name of this client.""" + """Return the human-readable name of this client. + + Returns + ------- + str + The client display name. + """ return "PySUS s3 Client" @property def description(self) -> str: - """Return a description of this client.""" + """Return a description of this client. + + Returns + ------- + str + A description string (currently empty). + """ return "" # TODO: @property def catalog_path(self) -> Path: - """Return the local path to the downloaded catalog database.""" + """Return the local path to the downloaded catalog database. + + Returns + ------- + Path + Filesystem path to the local catalog database file. + """ return self._catalog_local @property @@ -127,7 +227,18 @@ def _is_authenticated(self) -> bool: return self.credentials is not None async def datasets(self, **kwargs) -> list[DuckDataset]: - """Return all datasets from the catalog as DuckDataset instances.""" + """Return all datasets from the catalog as DuckDataset instances. + + Parameters + ---------- + ``**kwargs`` + Additional filter arguments (currently unused). + + Returns + ------- + list[DuckDataset] + List of all datasets in the catalog. + """ if not self._Session: await self.connect() @@ -155,7 +266,17 @@ async def login( secret_key: str | None = None, **kwargs, ) -> None: - """Authenticate with S3 credentials and reconnect to the catalog.""" + """Authenticate with S3 credentials and reconnect to the catalog. + + Parameters + ---------- + access_key : str, optional + S3 access key ID. If omitted, credentials are cleared. 
+ secret_key : str, optional + S3 secret access key. If omitted, credentials are cleared. + ``**kwargs`` + Additional arguments (currently unused). + """ if access_key and secret_key: self.credentials = DuckLakeCredentials( access_key=SecretStr(access_key), @@ -216,7 +337,13 @@ def _setup_engine(self): return engine async def connect(self, force: bool = False): - """Connect to the catalog, downloading it first if necessary.""" + """Connect to the catalog, downloading it first if necessary. + + Parameters + ---------- + force : bool, optional + Whether to re-download and re-connect even if already connected. + """ if self._engine and not force: if not self._Session: self._Session = sessionmaker(bind=self._engine) @@ -227,7 +354,13 @@ async def connect(self, force: bool = False): self._Session = sessionmaker(bind=self._engine) async def close(self): - """Dispose the engine, then upload the catalog if authenticated.""" + """Dispose the engine, then upload the catalog if authenticated. + + Raises + ------ + PermissionError + If the client is not authenticated but an upload is required. + """ if self._engine: await to_thread.run_sync(self._engine.dispose) @@ -341,7 +474,28 @@ async def query( year: int | None = None, month: int | None = None, ) -> list[File]: - """Filter catalog files by client, dataset, group, state, year.""" + """Filter catalog files by client, dataset, group, state, year. + + Parameters + ---------- + client : Literal["FTP", "DadosGov"], optional + Source client to filter by. + dataset : str, optional + Dataset name to filter by. + group : str, optional + Group name pattern to filter by (case-insensitive ILIKE). + state : str, optional + Two-letter state code to filter by. + year : int, optional + Year to filter by. + month : int, optional + Month to filter by. + + Returns + ------- + list[:class:`~pysus.api.ducklake.models.File`] + List of matching file objects. 
+ """ if not self._Session: await self.connect() diff --git a/pysus/api/ducklake/models.py b/pysus/api/ducklake/models.py index baf0e66..306a96f 100644 --- a/pysus/api/ducklake/models.py +++ b/pysus/api/ducklake/models.py @@ -24,7 +24,19 @@ class File(BaseRemoteFile): - """A remote file in DuckLake catalog with download and verification.""" + """A remote file in the DuckLake catalog with download and verification. + + Parameters + ---------- + record : CatalogFile + The underlying ORM record. + type : str, optional + File type identifier (default ``"remote"``). + dataset : Any + The parent dataset object. + group : Any, optional + The parent group object, if any. + """ record: CatalogFile = Field(exclude=True) type: str = "remote" @@ -33,32 +45,68 @@ class File(BaseRemoteFile): @property def basename(self) -> str: - """Return the file name without directory components.""" + """Return the file name without directory components. + + Returns + ------- + str + The base file name. + """ return self.path.name @property def extension(self) -> str: - """Return the file extension including the leading dot.""" + """Return the file extension including the leading dot. + + Returns + ------- + str + File extension (e.g. ``'.csv'``). + """ return self.path.suffix @property def size(self) -> int: - """Return the file size in bytes.""" + """Return the file size in bytes. + + Returns + ------- + int + File size in bytes. + """ return self.record.size @property def modify(self) -> datetime: - """Return the last-modified timestamp.""" + """Return the last-modified timestamp. + + Returns + ------- + datetime + The last modification timestamp. + """ return self.record.modified @property def rows(self) -> int: - """Return the number of rows in the file.""" + """Return the number of rows in the file. + + Returns + ------- + int + Row count. 
+ """ return self.record.rows @property def sha256(self) -> str | None: - """Return the SHA-256 hash of the file, if available.""" + """Return the SHA-256 hash of the file, if available. + + Returns + ------- + str or None + SHA-256 hex digest, or None if not recorded. + """ return self.record.sha256 async def _download( @@ -77,7 +125,18 @@ async def _download( ) async def verify(self, path: Path) -> bool: - """Verify the file matches the recorded SHA-256 hash.""" + """Verify the file matches the recorded SHA-256 hash. + + Parameters + ---------- + path : Path + Path to the downloaded file on disk. + + Returns + ------- + bool + True if the hash matches or no hash is recorded, False otherwise. + """ if not self.sha256: return True @@ -93,22 +152,49 @@ def _calculate(): class DuckDataset(BaseRemoteDataset): - """A dataset from the DuckLake catalog, containing groups and files.""" + """A dataset from the DuckLake catalog, containing groups and files. + + Parameters + ---------- + record : CatalogDataset + The underlying ORM record. + client : BaseRemoteClient + The parent client instance. + """ record: CatalogDataset = Field(exclude=True) client: BaseRemoteClient = Field(exclude=True) def __repr__(self) -> str: + """Return a string representation of the dataset. + + Returns + ------- + str + The uppercased dataset name. + """ return self.name.upper() @property def name(self) -> str: - """Return the short name of the dataset.""" + """Return the short name of the dataset. + + Returns + ------- + str + The dataset short name. + """ return self.record.name @property def long_name(self) -> str: - """Return the human-readable name of the dataset.""" + """Return the human-readable name of the dataset. + + Returns + ------- + str + The dataset display name, falling back to the short name. 
+ """ return ( self.record.dataset_metadata.long_name if self.record.dataset_metadata @@ -117,7 +203,13 @@ def long_name(self) -> str: @property def description(self) -> str: - """Return the description of the dataset.""" + """Return the description of the dataset. + + Returns + ------- + str + The dataset description, or an empty string if unavailable. + """ return ( self.record.dataset_metadata.description if self.record.dataset_metadata @@ -149,19 +241,39 @@ async def _fetch_content(self) -> list[Union["DuckGroup", File]]: class DuckGroup(BaseRemoteGroup): - """A group of related files within a DuckLake dataset.""" + """A group of related files within a DuckLake dataset. + + Parameters + ---------- + record : DatasetGroup + The underlying ORM record. + dataset : DuckDataset + The parent dataset instance. + """ record: DatasetGroup = Field(exclude=True) dataset: DuckDataset = Field(exclude=True) @property def name(self) -> str: - """Return the short name of the group.""" + """Return the short name of the group. + + Returns + ------- + str + The group short name. + """ return self.record.name @property def long_name(self) -> str: - """Return the human-readable name of the group.""" + """Return the human-readable name of the group. + + Returns + ------- + str + The group display name, falling back to the short name. + """ return ( self.record.group_metadata.long_name if self.record.group_metadata @@ -170,7 +282,13 @@ def long_name(self) -> str: @property def description(self) -> str: - """Return the description of the group.""" + """Return the description of the group. + + Returns + ------- + str + The group description, or an empty string if unavailable. 
+ """ if self.record.group_metadata: return self.record.group_metadata.description return "" diff --git a/pysus/api/extensions.py b/pysus/api/extensions.py index 0451b42..d918817 100644 --- a/pysus/api/extensions.py +++ b/pysus/api/extensions.py @@ -56,6 +56,7 @@ async def stream( """Yield the file contents in chunks of the given size.""" def _read_sync(): + """Read file chunks synchronously in a thread.""" with open(self.path, "rb") as f: while chunk := f.read(chunk_size): yield chunk @@ -135,6 +136,7 @@ async def _get_encoding(self) -> str: if self._encoding is None: def detect(): + """Detect encoding from file bytes synchronously.""" with open(self.path, "rb") as f: return chardet.detect(f.read(1024 * 300)) @@ -151,6 +153,7 @@ async def _get_sep(self) -> str: encoding = await self._get_encoding() def sniff(): + """Sniff the CSV delimiter synchronously.""" try: with open(self.path, encoding=encoding) as f: sample = f.read(1024 * 10) @@ -168,6 +171,7 @@ async def load(self) -> pd.DataFrame: separator = await self._get_sep() def _read_sync(): + """Read the CSV synchronously in a thread.""" return pd.read_csv( self.path, sep=separator, encoding=encoding, low_memory=False ) @@ -183,6 +187,7 @@ async def stream( separator = await self._get_sep() def _get_reader_sync(): + """Create a CSV chunk reader synchronously in a thread.""" return pd.read_csv( self.path, sep=separator, @@ -233,6 +238,7 @@ async def load(self, parse: bool = True) -> pd.DataFrame: """Read the entire Parquet file into a DataFrame.""" def _load(): + """Read the Parquet file synchronously in a thread.""" df = pd.read_parquet(self.path, engine="pyarrow") if parse: df = self.parse_dftypes(df) @@ -265,12 +271,14 @@ def parse_dftypes(df: pd.DataFrame) -> pd.DataFrame: """Convert known date and integer columns to their proper types.""" def str_to_int(string): + """Convert a string to int, return original if not possible.""" if pd.isna(string): return string clean = str(string).replace(" ", "") return 
int(clean) if clean.isnumeric() else string def str_to_date(string): + """Convert a date string to date or return original on failure.""" if isinstance(string, str): try: return datetime.strptime(string, "%Y%m%d").date() @@ -308,7 +316,19 @@ def rows(self) -> int: return len(DBFReader(self.path, load=False)) def decode_column(self, value): - """Decode a byte string value using cp1252 encoding.""" + """Decode a raw DBF value, handling byte strings and null bytes. + + Parameters + ---------- + value : bytes or str or Any + The value to decode. + + Returns + ------- + str or Any + The decoded and stripped string, or the original value if it is + neither bytes nor str. + """ if isinstance(value, bytes): return ( value.decode(encoding="cp1252", errors="replace") @@ -323,6 +343,7 @@ async def load(self) -> pd.DataFrame: """Read the entire DBF file into a DataFrame.""" def _load(): + """Read the DBF file synchronously in a thread.""" dbf = DBFReader(self.path, encoding="cp1252", raw=True) df = pd.DataFrame(iter(dbf)) return df.map(self.decode_column) @@ -336,6 +357,7 @@ async def stream( """Yield the DBF records in chunks of the given size.""" def _get_db(): + """Open the DBF reader synchronously in a thread.""" return DBFReader(self.path, encoding="cp1252", raw=True) dbf_file = await to_thread.run_sync(_get_db) @@ -371,6 +393,7 @@ async def to_parquet( raise RuntimeError(f"Could not parse {out} to Parquet") async def _stream_to_single_file(): + """Stream DBF records and write them to a single Parquet file.""" dbf_reader = DBFReader(self.path, encoding="cp1252", raw=True) total_rows = len(dbf_reader) writer = None @@ -538,6 +561,7 @@ async def stream( """Yield the PDF file contents in chunks of the given size.""" def _read(): + """Read PDF file data synchronously.""" with open(self.path, "rb") as f: if chunk_size: while chunk := f.read(chunk_size): @@ -563,6 +587,7 @@ async def list_members(self) -> list[str]: """Return the list of member names inside the archive.""" def 
_list(): + """List ZIP members synchronously in a thread.""" with zipfile.ZipFile(self.path) as z: return z.namelist() @@ -572,6 +597,7 @@ async def open_member(self, member_name: str) -> bytes: """Read and return the contents of a named archive member.""" def _read(): + """Read a ZIP member synchronously in a thread.""" with zipfile.ZipFile(self.path) as z: return z.read(member_name) @@ -588,6 +614,7 @@ async def extract( target_dir.mkdir(parents=True, exist_ok=True) def _extract_sync(): + """Extract ZIP contents synchronously in a thread.""" with zipfile.ZipFile(self.path) as z: z.extractall(target_dir) @@ -636,6 +663,7 @@ async def _safe_cleanup(self, directory: Path): """Remove a temporary directory and its contents.""" def _cleanup(): + """Remove directory contents synchronously in a thread.""" if not directory.exists(): return @@ -663,6 +691,7 @@ async def load(self) -> bytes: """Decompress and read the entire file contents into memory.""" def _read(): + """Decompress and read synchronously in a thread.""" with gzip.open(self.path, "rb") as f: return f.read() @@ -688,6 +717,7 @@ async def extract( out_file = target_dir / self.path.stem def _decompress(): + """Decompress gzip file synchronously in a thread.""" with ( gzip.open(self.path, "rb") as f_in, open( @@ -714,6 +744,7 @@ async def list_members(self) -> list[str]: """Return the list of member names inside the archive.""" def _list(): + """List Tar members synchronously in a thread.""" with tarfile.open(self.path) as t: return t.getnames() @@ -723,6 +754,7 @@ async def open_member(self, member_name: str) -> bytes: """Read and return the contents of a named archive member.""" def _read(): + """Read a Tar member synchronously in a thread.""" with tarfile.open(self.path) as t: f = t.extractfile(member_name) return f.read() if f else b"" @@ -740,6 +772,7 @@ async def extract( members = await self.list_members() def _extract(): + """Extract Tar contents synchronously in a thread.""" with 
tarfile.open(self.path) as t: t.extractall(target_dir) @@ -802,6 +835,7 @@ def stream( """Raise ImportError indicating the missing DBC dependency.""" async def _internal_gen(): + """Yield nothing; always raises ImportError.""" raise ImportError(self.import_err) yield pd.DataFrame() @@ -862,7 +896,21 @@ async def _identify(cls, path: Path) -> type[BaseLocalFile] | None: @classmethod async def get_file_class(cls, path: Path) -> type[BaseLocalFile]: - """Return handler class for path, falling back to extension matching.""" + """Return the file handler class for a given path. + + First attempts MIME-type identification; falls back to extension + matching. + + Parameters + ---------- + path : Path + The file path to classify. + + Returns + ------- + type[BaseLocalFile] + The handler class for the file type. + """ mime_class = await cls._identify(path) if mime_class: return mime_class @@ -873,7 +921,21 @@ async def get_file_class(cls, path: Path) -> type[BaseLocalFile]: @classmethod async def instantiate(cls, path: str | Path) -> BaseLocalFile: - """Create and return the appropriate file handler for a given path.""" + """Create and return the appropriate file handler for a path. + + Determines whether the path is a directory or a file, resolves the + handler class, and instantiates it. + + Parameters + ---------- + path : str or Path + The filesystem path to wrap in a handler. + + Returns + ------- + BaseLocalFile + The instantiated file handler. + """ path = Path(path).expanduser().resolve() if await to_thread.run_sync(path.is_dir): return Directory(path=path, type="DIR") diff --git a/pysus/api/ftp/client.py b/pysus/api/ftp/client.py index 3c1b46f..76872e5 100644 --- a/pysus/api/ftp/client.py +++ b/pysus/api/ftp/client.py @@ -48,17 +48,35 @@ class FTP(BaseRemoteClient): @property def name(self) -> str: - """Return the short name of this client.""" + """Return the short name of this client. + + Returns + ------- + str + The client short name ("FTP"). 
+ """ return "FTP" @property def long_name(self) -> str: - """Return the human-readable name of this client.""" + """Return the human-readable name of this client. + + Returns + ------- + str + The human-readable client name. + """ return "Pysus FTP Client" @property def description(self) -> str: - """Return a description of this client's purpose.""" + """Return a description of this client's purpose. + + Returns + ------- + str + A description string explaining the FTP client's capabilities. + """ return """ O cliente FTP do pysus foi desenvolvido para fornecer uma interface assíncrona e moderna para navegação e extração de dados diretamente @@ -69,11 +87,23 @@ def description(self) -> str: @property def ftp(self) -> FTPLib | None: - """Return the underlying ftplib.FTP, or None if not connected.""" + """Return the underlying ftplib.FTP, or None if not connected. + + Returns + ------- + FTPLib | None + The ftplib.FTP instance, or None if not connected. + """ return self._ftp async def connect(self) -> None: - """Establish the FTP connection to the remote host.""" + """Establish the FTP connection to the remote host. + + Raises + ------ + Exception + Any exception raised by ftplib during connection. + """ def _connect(): if self.ftp is None: @@ -83,11 +113,28 @@ def _connect(): await to_thread.run_sync(_connect) async def login(self, **kwargs) -> None: - """Authenticate and connect to the FTP server (alias for connect).""" + """Authenticate and connect to the FTP server (alias for connect). + + Parameters + ---------- + ``**kwargs`` + Accepted but ignored; ``connect()`` is called without arguments. + + Raises + ------ + Exception + Any exception raised by ftplib during authentication. + """ await self.connect() async def close(self) -> None: - """Close the FTP connection and reset the internal client state.""" + """Close the FTP connection and reset the internal client state. + + Raises + ------ + Exception + Any exception raised by ftplib during disconnection.
+ """ def _close(): if self.ftp: @@ -101,7 +148,18 @@ def _close(): await to_thread.run_sync(_close) async def datasets(self, **kwargs) -> list[Dataset]: - """Return a list of all available dataset instances for this client.""" + """Return a list of all available dataset instances for this client. + + Returns + ------- + list[:class:`~pysus.api.ftp.models.Dataset`] + A list of Dataset instances for all available databases. + + Raises + ------ + ConnectionError + If the FTP client is not connected. + """ from .databases import AVAILABLE_DATABASES if self.ftp is None: diff --git a/pysus/api/ftp/databases.py b/pysus/api/ftp/databases.py index c08b09d..f84dcd5 100644 --- a/pysus/api/ftp/databases.py +++ b/pysus/api/ftp/databases.py @@ -19,17 +19,35 @@ class CIHA(Dataset): @property def name(self) -> str: - """Return the dataset short name.""" + """Return the dataset short name. + + Returns + ------- + str + The dataset acronym (e.g. "CIHA"). + """ return "CIHA" @property def long_name(self) -> str: - """Return the dataset full name in Portuguese.""" + """Return the dataset full name in Portuguese. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ return "Comunicação de Internação Hospitalar e Ambulatorial" @property def description(self) -> str: - """Return a description of the dataset's purpose.""" + """Return a description of the dataset's purpose. + + Returns + ------- + str + A description of the dataset's purpose in Portuguese. + """ return ( "A CIHA foi criada para ampliar o processo de planejamento, " "programação, controle, avaliação e regulação da assistência à " @@ -38,7 +56,19 @@ def description(self) -> str: ) def formatter(self, filename: str) -> dict[str, Any]: - """Parse a CIHA filename into group, state, year and month metadata.""" + """Parse a CIHA filename into group, state, year and month metadata. + + Parameters + ---------- + filename : str + The raw CIHA filename to parse. 
+ + Returns + ------- + dict[str, Any] + A dict with keys ``group``, ``state``, ``year``, ``month``. + On parse failure values are set to None. + """ try: name = filename.split(".")[0].upper() group_code = name[:4] @@ -88,17 +118,35 @@ class CNES(Dataset): @property def name(self) -> str: - """Return the dataset short name.""" + """Return the dataset short name. + + Returns + ------- + str + The dataset acronym, ``"CNES"``. + """ return "CNES" @property def long_name(self) -> str: - """Return the dataset full name in Portuguese.""" + """Return the dataset full name in Portuguese. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ return "Cadastro Nacional de Estabelecimentos de Saúde" @property def description(self) -> str: - """Return a description of the dataset's purpose.""" + """Return a description of the dataset's purpose. + + Returns + ------- + str + A description of the dataset's purpose in Portuguese. + """ return ( "O Cadastro Nacional de Estabelecimentos de Saúde (CNES) é o " "sistema de informação oficial de cadastramento de informações " @@ -106,7 +154,19 @@ def description(self) -> str: ) def formatter(self, filename: str) -> dict[str, Any]: - """Parse a CNES filename into group, state, year and month metadata.""" + """Parse a CNES filename into group, state, year and month metadata. + + Parameters + ---------- + filename : str + The raw CNES filename to parse. + + Returns + ------- + dict[str, Any] + A dict with keys ``group``, ``state``, ``year``, ``month``. + On parse failure values are set to None. + """ try: name = filename.split(".")[0].upper() group_code = name[:2] @@ -145,24 +205,54 @@ class SINASC(Dataset): @property def name(self) -> str: - """Return the dataset short name.""" + """Return the dataset short name. + + Returns + ------- + str + The dataset acronym, ``"SINASC"``.
+ """ return "SINASC" @property def long_name(self) -> str: - """Return the dataset full name in Portuguese.""" + """Return the dataset full name in Portuguese. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ return "Sistema de Informações sobre Nascidos Vivos" @property def description(self) -> str: - """Return a description of the dataset's purpose.""" + """Return a description of the dataset's purpose. + + Returns + ------- + str + A description of the dataset's purpose in Portuguese. + """ return """ O SINASC fornece subsídios para o diagnóstico de saúde e planejamento de políticas. """ def formatter(self, filename: str) -> dict[str, Any]: - """Parse a SINASC filename into group, state and year metadata.""" + """Parse a SINASC filename into group, state and year metadata. + + Parameters + ---------- + filename : str + The raw SINASC filename to parse. + + Returns + ------- + dict[str, Any] + A dict with keys ``group``, ``state``, ``year``. + On parse failure values are set to None. + """ try: name = filename.split(".")[0].upper() year_short = name[-2:] @@ -195,21 +285,51 @@ class SIM(Dataset): @property def name(self) -> str: - """Return the dataset short name.""" + """Return the dataset short name. + + Returns + ------- + str + The dataset acronym, ``"SIM"``. + """ return "SIM" @property def long_name(self) -> str: - """Return the dataset full name in Portuguese.""" + """Return the dataset full name in Portuguese. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ return "Sistema de Informação sobre Mortalidade" @property def description(self) -> str: - """Return a description of the dataset's purpose.""" + """Return a description of the dataset's purpose. + + Returns + ------- + str + A description of the dataset's purpose in Portuguese. + """ return "O SIM coleta dados sobre obitos no pais para analise epidemiologica."
# noqa def formatter(self, filename: str) -> dict[str, Any]: - """Parse a SIM filename into group, state and year metadata.""" + """Parse a SIM filename into group, state and year metadata. + + Parameters + ---------- + filename : str + The raw SIM filename to parse. + + Returns + ------- + dict[str, Any] + A dict with keys ``group``, ``state``, ``year``. + On parse failure values are set to None. + """ try: name = filename.split(".")[0].upper() if "CID9" in filename: @@ -242,21 +362,51 @@ class PNI(Dataset): @property def name(self) -> str: - """Return the dataset short name.""" + """Return the dataset short name. + + Returns + ------- + str + The dataset acronym, ``"PNI"``. + """ return "PNI" @property def long_name(self) -> str: - """Return the dataset full name in Portuguese.""" + """Return the dataset full name in Portuguese. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ return "Programa Nacional de Imunizações" @property def description(self) -> str: - """Return a description of the dataset's purpose.""" + """Return a description of the dataset's purpose. + + Returns + ------- + str + A description of the dataset's purpose in Portuguese. + """ return "O SI-PNI monitora a cobertura vacinal e doses aplicadas." def formatter(self, filename: str) -> dict[str, Any]: - """Parse a PNI filename into group, state and year metadata.""" + """Parse a PNI filename into group, state and year metadata. + + Parameters + ---------- + filename : str + The raw PNI filename to parse. + + Returns + ------- + dict[str, Any] + A dict with keys ``group``, ``state``, ``year``. + On parse failure values are set to None. + """ try: name = filename.split(".")[0].upper() group_code, state, year_short = name[:4], name[4:6], name[-2:] @@ -296,21 +446,51 @@ class IBGEDATASUS(Dataset): @property def name(self) -> str: - """Return the dataset short name.""" + """Return the dataset short name. + + Returns + ------- + str + The dataset acronym, ``"IBGE"``.
+ """ return "IBGE" @property def long_name(self) -> str: - """Return the dataset full name in Portuguese.""" + """Return the dataset full name in Portuguese. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ return "População Residente e Projeções (IBGE)" @property def description(self) -> str: - """Return a description of the dataset's purpose.""" + """Return a description of the dataset's purpose. + + Returns + ------- + str + A description of the dataset's purpose in Portuguese. + """ return "Informações sobre a população residente obtidas de Censos." def formatter(self, filename: str) -> dict[str, Any]: - """Parse an IBGE filename into group and year metadata.""" + """Parse an IBGE filename into group and year metadata. + + Parameters + ---------- + filename : str + The raw IBGE filename to parse. + + Returns + ------- + dict[str, Any] + A dict with keys ``group``, ``year``. + On parse failure values are set to None. + """ try: name = filename.split(".")[0].upper() year = name[-2:] @@ -351,21 +531,51 @@ class SIA(Dataset): @property def name(self) -> str: - """Return the dataset short name.""" + """Return the dataset short name. + + Returns + ------- + str + The dataset acronym, ``"SIA"``. + """ return "SIA" @property def long_name(self) -> str: - """Return the dataset full name in Portuguese.""" + """Return the dataset full name in Portuguese. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ return "Sistema de Informações Ambulatoriais" @property def description(self) -> str: - """Return a description of the dataset's purpose.""" + """Return a description of the dataset's purpose. + + Returns + ------- + str + A description of the dataset's purpose in Portuguese. + """ return "O SIA acompanha as ações de saúde produzidas."
def formatter(self, filename: str) -> dict[str, Any]: - """Parse an SIA filename into group, state, year and month metadata.""" + """Parse an SIA filename into group, state, year and month metadata. + + Parameters + ---------- + filename : str + The raw SIA filename to parse. + + Returns + ------- + dict[str, Any] + A dict with keys ``group``, ``state``, ``year``, ``month``. + On parse failure values are set to None. + """ try: name = filename.split(".")[0].upper() digits = "".join([d for d in name if d.isdigit()]) @@ -409,23 +619,53 @@ class SIH(Dataset): @property def name(self) -> str: - """Return the dataset short name.""" + """Return the dataset short name. + + Returns + ------- + str + The dataset acronym, ``"SIH"``. + """ return "SIH" @property def long_name(self) -> str: - """Return the dataset full name in Portuguese.""" + """Return the dataset full name in Portuguese. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ return "Sistema de Informações Hospitalares" @property def description(self) -> str: - """Return a description of the dataset's purpose.""" + """Return a description of the dataset's purpose. + + Returns + ------- + str + A description of the dataset's purpose in Portuguese. + """ return """ O SIH processa as internações hospitalares financiadas pelo SUS. """ def formatter(self, filename: str) -> dict[str, Any]: - """Parse an SIH filename into group, state, year and month metadata.""" + """Parse an SIH filename into group, state, year and month metadata. + + Parameters + ---------- + filename : str + The raw SIH filename to parse. + + Returns + ------- + dict[str, Any] + A dict with keys ``group``, ``state``, ``year``, ``month``. + On parse failure values are set to None. + """ try: name = filename.split(".")[0].upper() group_code = name[:2] @@ -508,21 +748,51 @@ class SINAN(Dataset): @property def name(self) -> str: - """Return the dataset short name.""" + """Return the dataset short name.
+ + Returns + ------- + str + The dataset acronym, ``"SINAN"``. + """ return "SINAN" @property def long_name(self) -> str: - """Return the dataset full name in Portuguese.""" + """Return the dataset full name in Portuguese. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ return "Sistema de Informação de Agravos de Notificação" @property def description(self) -> str: - """Return a description of the dataset's purpose.""" + """Return a description of the dataset's purpose. + + Returns + ------- + str + A description of the dataset's purpose in Portuguese. + """ return "O SINAN é alimentado pela notificação de doenças compulsórias." def formatter(self, filename: str) -> dict[str, Any]: - """Parse a SINAN filename into group and year metadata.""" + """Parse a SINAN filename into group and year metadata. + + Parameters + ---------- + filename : str + The raw SINAN filename to parse. + + Returns + ------- + dict[str, Any] + A dict with keys ``group``, ``year``. + On parse failure values are set to None. + """ try: name = filename.split(".")[0].upper() year_short = name[-2:] diff --git a/pysus/api/ftp/models.py b/pysus/api/ftp/models.py index 06bf291..88edb1a 100644 --- a/pysus/api/ftp/models.py +++ b/pysus/api/ftp/models.py @@ -28,7 +28,14 @@ class File(BaseRemoteFile): _info: FTPFileInfo = PrivateAttr() def __init__(self, **data): - """Initialise the File with raw FTP metadata.""" + """Initialise the File with raw FTP metadata. + + Parameters + ---------- + **data + Keyword arguments passed to BaseRemoteFile, including + optional ``_info`` with parsed FTP metadata. + """ info = data.pop("_info", None) if "path" not in data and info and "path" in info: data["path"] = info["path"] @@ -47,22 +54,51 @@ def __init__(self, **data): ) def __repr__(self) -> str: - """Return the file name as its string representation.""" + """Return the file name as its string representation. + + Returns + ------- + str + The file name.
+ """ return self.name @property def extension(self) -> str: - """Return the file extension (e.g. .dbc, .dbf).""" + """Return the file extension (e.g. .dbc, .dbf). + + Returns + ------- + str + The file extension including the leading dot. + """ return Path(self.path).suffix @property def size(self) -> int: - """Return the file size in bytes.""" + """Return the file size in bytes. + + Returns + ------- + int + The file size in bytes. + """ return self._info.get("size", 0) @property def modify(self) -> datetime: - """Return the last modification timestamp.""" + """Return the last modification timestamp. + + Returns + ------- + datetime + The file's last modification datetime. + + Raises + ------ + ValueError + If no modification date is available. + """ m = self._info.get("modify") if not m: raise ValueError("File requires a modify date") @@ -70,17 +106,35 @@ def modify(self) -> datetime: @property def year(self) -> int | None: - """Return the data year extracted from the filename, if available.""" + """Return the data year extracted from the filename, if available. + + Returns + ------- + int | None + The year as an integer, or None if not available. + """ return self._info.get("year") @property def month(self) -> int | None: - """Return the data month extracted from the filename, if available.""" + """Return the data month extracted from the filename, if available. + + Returns + ------- + int | None + The month as an integer, or None if not available. + """ return self._info.get("month") @property def state(self) -> State | None: - """Return the state code extracted from the filename, if available.""" + """Return the state code extracted from the filename, if available. + + Returns + ------- + State | None + The state code, or None if not available. 
+ """ return self._info.get("state", None) async def _download( @@ -108,7 +162,21 @@ def __init__( formatter: Callable | None = None, dataset: Dataset | None = None, ): - """Initialise the Directory with a remote path and optional context.""" + """Initialise the Directory with a remote path and optional context. + + Parameters + ---------- + path : str + The remote directory path. + parent : Directory | Dataset | Group | None, optional + The parent directory, dataset or group. + client : BaseRemoteClient | None, optional + The FTP client instance. + formatter : Callable | None, optional + A filename formatter function. + dataset : Dataset | None, optional + The dataset this directory belongs to. + """ self.path = os.path.normpath(path) self.parent = parent self.dataset = dataset or getattr(parent, "dataset", None) @@ -120,13 +188,25 @@ def __init__( @property async def content(self) -> list[Directory | File]: - """Return the directory contents, loading from FTP if not yet cached.""" + """Return the directory contents, loading from FTP if not yet cached. + + Returns + ------- + list[Directory | File] + The list of files and subdirectories. + """ if not self.loaded: await self.load() return self._content async def load(self) -> None: - """Fetch and parse the directory listing from the FTP server.""" + """Fetch and parse the directory listing from the FTP server. + + Raises + ------ + ValueError + If the client is not an FTP instance. + """ if not isinstance(self.client, FTP): raise ValueError("no ftp client found") raw_infos = await self.client._list_directory( @@ -162,11 +242,23 @@ async def load(self) -> None: self.loaded = True def __str__(self) -> str: - """Return the normalised directory path.""" + """Return the normalised directory path. + + Returns + ------- + str + The normalised path string. + """ return self.path def __repr__(self) -> str: - """Return a debug representation of this directory.""" + """Return a debug representation of this directory. 
+ + Returns + ------- + str + A debug string with the directory path. + """ return f"" @@ -188,7 +280,23 @@ def __init__( description: str = "", **data: Any, ): - """Initialise the Group with metadata and a directory reference.""" + """Initialise the Group with metadata and a directory reference. + + Parameters + ---------- + name : str + The group short code. + path : str + The remote directory path for this group. + dataset : Dataset + The parent dataset. + long_name : str + The human-readable group name. + description : str, optional + A description of the group. + **data : Any + Additional keyword arguments. + """ data.update({"dataset": dataset, "path": path}) super().__init__(**data) @@ -205,22 +313,46 @@ def __init__( @property def name(self) -> str: - """Return the group short code (e.g. 'RD', 'PA').""" + """Return the group short code (e.g. 'RD', 'PA'). + + Returns + ------- + str + The group short code. + """ return self._name @property def long_name(self) -> str: - """Return the human-readable group name.""" + """Return the human-readable group name. + + Returns + ------- + str + The human-readable group name. + """ return self._long_name @property def description(self) -> str: - """Return the group description.""" + """Return the group description. + + Returns + ------- + str + The group description. + """ return self._description @property async def content(self) -> list[Directory | File]: - """Return the contents of the underlying directory.""" + """Return the contents of the underlying directory. + + Returns + ------- + list[Directory | File] + The directory contents. + """ return await self._dir.content async def _fetch_files(self) -> list[BaseRemoteFile]: @@ -238,21 +370,50 @@ class Dataset(BaseRemoteDataset, ABC): @property @abstractmethod def name(self) -> str: - """Return the dataset short name.""" + """Return the dataset short name. + + Returns + ------- + str + The dataset acronym. 
+ """ @property @abstractmethod def long_name(self) -> str: - """Return the dataset full name in Portuguese.""" + """Return the dataset full name in Portuguese. + + Returns + ------- + str + The full Portuguese name of the dataset. + """ @property @abstractmethod def description(self) -> str: - """Return a description of the dataset's purpose.""" + """Return a description of the dataset's purpose. + + Returns + ------- + str + A description of the dataset's purpose. + """ @abstractmethod def formatter(self, filename: str) -> dict[str, Any]: - """Parse a filename into metadata (group, state, year, etc.).""" + """Parse a filename into metadata (group, state, year, etc.). + + Parameters + ---------- + filename : str + The raw filename to parse. + + Returns + ------- + dict[str, Any] + A dictionary of parsed metadata fields. + """ async def _fetch_content( self, @@ -289,5 +450,11 @@ async def _fetch_content( return results def __repr__(self) -> str: - """Return the dataset short name as its string representation.""" + """Return the dataset short name as its string representation. + + Returns + ------- + str + The dataset short name. + """ return self.name diff --git a/pysus/api/models.py b/pysus/api/models.py index 5e883c3..6f9029a 100644 --- a/pysus/api/models.py +++ b/pysus/api/models.py @@ -56,6 +56,7 @@ def basename(self) -> str: return self.path.name def __str__(self) -> str: + """Return the file's basename as its string representation.""" return self.basename @property @@ -84,6 +85,7 @@ class BaseLocalFile(BaseFile, ABC): @property def name(self) -> str: + """Return the file name from the path.""" return self.path.name async def get_hash( @@ -91,12 +93,21 @@ async def get_hash( ) -> str: """Compute the file's hash digest. - *algorithm* is the hash algorithm name (default "sha256"). - *chunk_size* is the read chunk size in bytes. - Return the hex digest string. + Parameters + ---------- + algorithm : str, optional + The hash algorithm name (default ``"sha256"``). 
+ chunk_size : int, optional + Read chunk size in bytes (default 1 MiB). + + Returns + ------- + str + The hex digest string. """ def _compute_hash(): + """Hash the file contents, reading chunk_size bytes at a time.""" hash_obj = hashlib.new(algorithm) with open(self.path, "rb") as f: while chunk := f.read(chunk_size): @@ -118,14 +129,17 @@ def stream( @property def extension(self) -> str: + """Return the file extension from the local path.""" return self.path.suffix @property def size(self) -> int: + """Return the file size in bytes from the local filesystem.""" return self.path.stat().st_size @property def modify(self) -> datetime: + """Return the last modification timestamp from the local filesystem.""" return datetime.fromtimestamp(self.path.stat().st_mtime) @@ -164,10 +178,21 @@ async def to_parquet( ) -> Parquet: """Convert the file to Parquet format. - *output_path* is the destination path; defaults to the source path - with a .parquet extension. *chunk_size* controls the streaming chunk - size. *callback* receives (current_rows, total_rows) after each chunk. - Return the resulting Parquet wrapper object. + Parameters + ---------- + output_path : str or Path, optional + Destination path for the Parquet file. Defaults to the source + path with a ``.parquet`` extension. + chunk_size : int, optional + Number of rows per streaming chunk (default 10 000). + callback : Callable[[int, int], None], optional + Function called after each chunk with + ``(current_rows, total_rows)``. + + Returns + ------- + Parquet + The resulting Parquet wrapper object.
""" from pysus.api.extensions import ExtensionFactory, Parquet @@ -273,7 +298,7 @@ class SearchableMixin: """Mixin providing attribute-based filtering for remote objects.""" def _matches(self, obj: Any, **kwargs) -> bool: - """Return True if all *kwargs* attributes match on *obj*.""" + """Return True if all *kwargs* attributes equal those on *obj*.""" for key, value in kwargs.items(): obj_value = getattr(obj, key, None) if obj_value != value: @@ -293,6 +318,7 @@ class BaseRemoteFile(BaseFile, SearchableMixin, ABC): @property def name(self) -> str: + """Return the basename as the display name.""" return self.basename @property @@ -364,6 +390,7 @@ class BaseRemoteObject(BaseModel, ABC): model_config = ConfigDict(arbitrary_types_allowed=True) def __str__(self) -> str: + """Return the short name as the string representation.""" return self.name @property diff --git a/pysus/api/utils.py b/pysus/api/utils.py index 1e82735..1a7c0f7 100644 --- a/pysus/api/utils.py +++ b/pysus/api/utils.py @@ -17,6 +17,19 @@ def is_geocode_column(name: str) -> bool: def add_dv(geocode: str) -> str: + """Add the IBGE verification digit to a municipality code. + + Parameters + ---------- + geocode : str + The municipality code (6 or 7 digits). + + Returns + ------- + str + The code with the verification digit appended, or the original + string if it cannot be processed. + """ if not geocode or not str(geocode).isdigit(): return geocode