cbrkit.loaders

This module provides several loaders to read data from different file formats and convert it into a Casebase. To validate the data against a Pydantic model, a validate function is also provided.

  1"""
  2This module provides several loaders to read data from different file formats and convert it into a Casebase. To validate the data against a Pydantic model, a `validate` function is also provided.
  3"""
  4
  5import csv as csvlib
  6from collections.abc import Callable, Iterable, Iterator, Mapping
  7from dataclasses import dataclass
  8from pathlib import Path
  9from typing import Any, BinaryIO, TextIO, cast
 10
 11import orjson
 12import pandas as pd
 13import polars as pl
 14import rtoml
 15import xmltodict
 16import yaml as yamllib
 17from pydantic import BaseModel
 18
 19from .helpers import load_object
 20from .typing import Casebase, ConversionFunc, FilePath
 21
 22__all__ = [  # names exported as this module's public API
 23    "path",
 24    "file",
 25    "directory",
 26    "validate",
 27    "csv",
 28    "json",
 29    "polars",
 30    "pandas",
 31    "py",
 32    "toml",
 33    "txt",
 34    "xml",
 35    "yaml",
 36]
 37
 38AnyIO = TextIO | BinaryIO  # a stream in text or binary mode
 39ReadableType = str | bytes | TextIO | BinaryIO  # inputs accepted by `read`
 40
 41
 42def read(data: ReadableType) -> str:
 43    if isinstance(data, str):
 44        return data
 45
 46    elif isinstance(data, bytes | bytearray):
 47        return data.decode("utf-8")
 48
 49    return read(data.read())  # pyright: ignore
 50
 51
 52@dataclass(slots=True, frozen=True)
 53class pandas(Mapping[int, pd.Series]):
 54    """A wrapper around a pandas DataFrame to provide a dict-like interface"""
 55
 56    df: pd.DataFrame
 57
 58    def __getitem__(self, key: int | str) -> pd.Series:
 59        if isinstance(key, str):
 60            return cast(pd.Series, self.df.loc[key])
 61
 62        return cast(pd.Series, self.df.iloc[key])
 63
 64    def __iter__(self) -> Iterator[int]:
 65        return iter(range(self.df.shape[0]))
 66
 67    def __len__(self) -> int:
 68        return self.df.shape[0]
 69
 70
 71@dataclass(slots=True, frozen=True)
 72class polars(Mapping[int, dict[str, Any]]):
 73    """A wrapper around a polars DataFrame to provide a dict-like interface"""
 74
 75    df: pl.DataFrame
 76
 77    def __getitem__(self, key: int) -> dict[str, Any]:
 78        return self.df.row(key, named=True)
 79
 80    def __iter__(self) -> Iterator[int]:
 81        return iter(range(self.df.shape[0]))
 82
 83    def __len__(self) -> int:
 84        return self.df.shape[0]
 85
 86
 87@dataclass(slots=True, frozen=True)
 88class py(ConversionFunc[str, Any]):
 89    """Reads a Python file and loads the object from it."""
 90
 91    def __call__(self, source: str) -> Any:
 92        return load_object(source)
 93
 94
 95@dataclass(slots=True, frozen=True)
 96class csv(ConversionFunc[Iterable[str] | ReadableType, dict[int, dict[str, str]]]):
 97    """Reads a csv file and converts it into a dict representation"""
 98
 99    def __call__(
100        self, source: Iterable[str] | ReadableType
101    ) -> dict[int, dict[str, str]]:
102        if isinstance(source, ReadableType):
103            source = read(source).splitlines()
104
105        reader = csvlib.DictReader(source)  # pyright: ignore
106        data: dict[int, dict[str, str]] = {}
107        row: dict[str, str]
108
109        for idx, row in enumerate(reader):
110            data[idx] = row
111
112        return data
113
114
115@dataclass(slots=True, frozen=True)
116class json(ConversionFunc[ReadableType, dict[Any, Any]]):
117    """Reads a json file and converts it into a dict representation"""
118
119    def __call__(self, source: ReadableType) -> dict[Any, Any]:
120        data = orjson.loads(read(source))
121
122        if isinstance(data, list):
123            return dict(enumerate(data))
124        elif isinstance(data, dict):
125            return data
126
127        raise TypeError(f"Invalid data type: {type(data)}")
128
129
@dataclass(slots=True, frozen=True)
class toml(ConversionFunc[ReadableType, dict[str, Any]]):
    """Parse TOML content with ``rtoml`` and return the resulting dict."""

    def __call__(self, source: ReadableType) -> dict[str, Any]:
        text = read(source)
        return rtoml.loads(text)
136
137
@dataclass(slots=True, frozen=True)
class yaml(ConversionFunc[ReadableType, dict[Any, Any]]):
    """Reads a YAML stream and merges all of its documents into one dict.

    Mapping documents are merged key by key, so later documents win on
    duplicate keys. Sequence documents contribute their items under
    consecutive integer keys that are numbered continuously across the
    whole stream.

    Raises:
        TypeError: If a document is neither a mapping nor a sequence.
    """

    def __call__(self, source: ReadableType) -> dict[Any, Any]:
        data: dict[Any, Any] = {}
        # Running key for sequence items. Keying by ``doc_idx + idx`` made the
        # items of consecutive sequence documents overwrite each other.
        next_key = 0

        for doc in yamllib.safe_load_all(source):
            if isinstance(doc, list):
                for item in doc:
                    data[next_key] = item
                    next_key += 1
            elif isinstance(doc, dict):
                data |= doc
            else:
                raise TypeError(f"Invalid document type: {type(doc)}")

        return data
155
156
@dataclass(slots=True, frozen=True)
class xml(ConversionFunc[ReadableType, dict[str, Any]]):
    """Parse an XML document with ``xmltodict``.

    When the parsed result has exactly one top-level key, the value stored
    under that key is returned instead of the wrapping dict.
    """

    def __call__(self, source: ReadableType) -> dict[str, Any]:
        parsed = xmltodict.parse(read(source))

        if len(parsed) != 1:
            return parsed

        # Exactly one top-level entry: unwrap its content.
        (root_content,) = parsed.values()
        return root_content
170
171
@dataclass(slots=True, frozen=True)
class txt(ConversionFunc[ReadableType, str]):
    """Return the content of ``source`` as a single string (see ``read``)."""

    def __call__(self, source: ReadableType) -> str:
        text: str = read(source)
        return text
178
179
def _csv_polars(source: Path | ReadableType) -> Mapping[int, dict[str, Any]]:
    """Load CSV data with ``pl.read_csv`` and wrap the frame in ``polars``."""
    frame = pl.read_csv(source)
    return polars(frame)
182
183
184StructuredLoader = Callable[[AnyIO], Mapping[Any, Any]]  # stream -> mapping
185AnyLoader = Callable[[AnyIO], Any]  # stream -> any value
186
187structured_loaders: dict[str, StructuredLoader] = {  # file suffix -> loader; consulted by `file`
188    ".json": json(),
189    ".toml": toml(),
190    ".yaml": yaml(),
191    ".yml": yaml(),
192    ".xml": xml(),
193    ".csv": _csv_polars,
194}
195
196any_loaders: dict[str, AnyLoader] = {  # consulted by `directory`; adds ".txt" to the table above
197    **structured_loaders,
198    ".txt": txt(),
199}
200
201
def path(
    path: FilePath, pattern: str | None = None, loader: AnyLoader | None = None
) -> Casebase[Any, Any]:
    """Load a Casebase from a path that is either a file or a directory.

    Args:
        path: Location of a file or a directory.
        pattern: Glob pattern forwarded to `directory`; only used when
            `path` is a directory.
        loader: Loader forwarded to `file`; only used when `path` is a file.

    Returns:
        The Casebase produced by `file` or `directory`.

    Raises:
        FileNotFoundError: If `path` is neither an existing file nor an
            existing directory.

    Examples:
        >>> file_path = "./data/cars-1k.csv"
        >>> result = path(file_path)
    """
    location = Path(path) if isinstance(path, str) else path

    if location.is_file():
        return file(location, loader)

    if location.is_dir():
        return directory(location, pattern)

    raise FileNotFoundError(location)
226
227
def file(path: FilePath, loader: StructuredLoader | None = None) -> Casebase[Any, Any]:
    """Load a single file into a Casebase.

    Args:
        path: Path of the file.
        loader: Callable applied to the file opened in binary mode. When
            omitted, the loader registered in `structured_loaders` for the
            file's suffix is used.

    Returns:
        Whatever the loader returns for the opened file.

    Raises:
        ValueError: If no loader is given and the suffix has no entry in
            `structured_loaders`.

    Examples:
        >>> from pathlib import Path
        >>> file_path = Path("./data/cars-1k.csv")
        >>> result = file(file_path)

    """
    file_path = Path(path) if isinstance(path, str) else path

    if loader is None:
        # Fall back to the loader registered for this suffix.
        if file_path.suffix not in structured_loaders:
            raise ValueError(f"Unsupported file type: {file_path.suffix}")

        loader = structured_loaders[file_path.suffix]

    with file_path.open("rb") as stream:
        return loader(stream)
254
255
def directory(path: FilePath, pattern: str | None = None) -> Casebase[Any, Any]:
    """Load every supported file of a directory into one Casebase.

    Each file whose suffix has an entry in `any_loaders` is opened in binary
    mode and loaded; the result is stored under the file's stem. Other
    entries are skipped.

    Args:
        path: Path of the directory.
        pattern: Relative glob pattern for the files; ``"*"`` when omitted.

    Returns:
        Returns a Casebase mapping file stems to the loaded content.

    Examples:
        >>> from pathlib import Path
        >>> directory_path = Path("./data")
        >>> result = directory(directory_path, "*.csv")
        >>> assert result is not None
    """
    root = Path(path) if isinstance(path, str) else path
    casebase: Casebase[Any, Any] = {}

    for entry in root.glob(pattern or "*"):
        if not entry.is_file() or entry.suffix not in any_loaders:
            continue

        load = any_loaders[entry.suffix]

        with entry.open("rb") as stream:
            casebase[entry.stem] = load(stream)

    return casebase
285
286
287def validate[K, V: BaseModel](
288    casebase: Casebase[K, Any], model: type[V]
289) -> Casebase[K, V]:
290    """Validates the casebase against a Pydantic model.
291
292    Args:
293        casebase: Casebase where the values are the data to validate.
294        model: Pydantic model to validate the data.
295
296    Examples:
297        >>> from pydantic import BaseModel, NonNegativeInt
298        >>> from typing import Literal
299        >>> class Car(BaseModel):
300        ...     price: NonNegativeInt
301        ...     year: NonNegativeInt
302        ...     manufacturer: str
303        ...     make: str
304        ...     fuel: Literal["gas", "diesel"]
305        ...     miles: NonNegativeInt
306        ...     title_status: Literal["clean", "rebuilt"]
307        ...     transmission: Literal["automatic", "manual"]
308        ...     drive: Literal["fwd", "rwd", "4wd"]
309        ...     type: str
310        ...     paint_color: str
311        >>> data = file("data/cars-1k.csv")
312        >>> casebase = validate(data, Car)
313        >>> data = polars(pl.read_csv("data/cars-1k.csv"))
314        >>> casebase = validate(data, Car)
315    """
316
317    return {key: model.model_validate(value) for key, value in casebase.items()}
def path( path: FilePath, pattern: str | None = None, loader: Callable[[typing.TextIO | typing.BinaryIO], typing.Any] | None = None) -> Casebase[typing.Any, typing.Any]:
def path(
    path: FilePath, pattern: str | None = None, loader: AnyLoader | None = None
) -> Casebase[Any, Any]:
    """Load a Casebase from a path that is either a file or a directory.

    Args:
        path: Location of a file or a directory.
        pattern: Glob pattern forwarded to `directory`; only used when
            `path` is a directory.
        loader: Loader forwarded to `file`; only used when `path` is a file.

    Returns:
        The Casebase produced by `file` or `directory`.

    Raises:
        FileNotFoundError: If `path` is neither an existing file nor an
            existing directory.

    Examples:
        >>> file_path = "./data/cars-1k.csv"
        >>> result = path(file_path)
    """
    location = Path(path) if isinstance(path, str) else path

    if location.is_file():
        return file(location, loader)

    if location.is_dir():
        return directory(location, pattern)

    raise FileNotFoundError(location)

Converts a path into a Casebase. The path can be a directory or a file.

Arguments:
  • path: Path of the file or directory.
Returns:

Returns a Casebase.

Examples:
>>> file_path = "./data/cars-1k.csv"
>>> result = path(file_path)
def file( path: FilePath, loader: Callable[[typing.TextIO | typing.BinaryIO], Mapping[typing.Any, typing.Any]] | None = None) -> Casebase[typing.Any, typing.Any]:
def file(path: FilePath, loader: StructuredLoader | None = None) -> Casebase[Any, Any]:
    """Load a single file into a Casebase.

    Args:
        path: Path of the file.
        loader: Callable applied to the file opened in binary mode. When
            omitted, the loader registered in `structured_loaders` for the
            file's suffix is used.

    Returns:
        Whatever the loader returns for the opened file.

    Raises:
        ValueError: If no loader is given and the suffix has no entry in
            `structured_loaders`.

    Examples:
        >>> from pathlib import Path
        >>> file_path = Path("./data/cars-1k.csv")
        >>> result = file(file_path)

    """
    file_path = Path(path) if isinstance(path, str) else path

    if loader is None:
        # Fall back to the loader registered for this suffix.
        if file_path.suffix not in structured_loaders:
            raise ValueError(f"Unsupported file type: {file_path.suffix}")

        loader = structured_loaders[file_path.suffix]

    with file_path.open("rb") as stream:
        return loader(stream)

Converts a file into a Casebase. The file can be of type csv, json, toml, xml, yaml, or yml.

Arguments:
  • path: Path of the file.
Returns:

Returns a Casebase.

Examples:
>>> from pathlib import Path
>>> file_path = Path("./data/cars-1k.csv")
>>> result = file(file_path)
def directory( path: FilePath, pattern: str | None = None) -> Casebase[typing.Any, typing.Any]:
def directory(path: FilePath, pattern: str | None = None) -> Casebase[Any, Any]:
    """Load every supported file of a directory into one Casebase.

    Each file whose suffix has an entry in `any_loaders` is opened in binary
    mode and loaded; the result is stored under the file's stem. Other
    entries are skipped.

    Args:
        path: Path of the directory.
        pattern: Relative glob pattern for the files; ``"*"`` when omitted.

    Returns:
        Returns a Casebase mapping file stems to the loaded content.

    Examples:
        >>> from pathlib import Path
        >>> directory_path = Path("./data")
        >>> result = directory(directory_path, "*.csv")
        >>> assert result is not None
    """
    root = Path(path) if isinstance(path, str) else path
    casebase: Casebase[Any, Any] = {}

    for entry in root.glob(pattern or "*"):
        if not entry.is_file() or entry.suffix not in any_loaders:
            continue

        load = any_loaders[entry.suffix]

        with entry.open("rb") as stream:
            casebase[entry.stem] = load(stream)

    return casebase

Converts the files of a directory into a Casebase. The files can be of type txt, csv, json, toml, xml, yaml, or yml.

Arguments:
  • path: Path of the directory.
  • pattern: Relative pattern for the files.
Returns:

Returns a Casebase.

Examples:
>>> from pathlib import Path
>>> directory_path = Path("./data")
>>> result = directory(directory_path, "*.csv")
>>> assert result is not None
def validate(casebase: Casebase[K, typing.Any], model: type[V]) -> Casebase[K, V]:
288def validate[K, V: BaseModel](
289    casebase: Casebase[K, Any], model: type[V]
290) -> Casebase[K, V]:
291    """Validates the casebase against a Pydantic model.
292
293    Args:
294        casebase: Casebase where the values are the data to validate.
295        model: Pydantic model to validate the data.
296
297    Examples:
298        >>> from pydantic import BaseModel, NonNegativeInt
299        >>> from typing import Literal
300        >>> class Car(BaseModel):
301        ...     price: NonNegativeInt
302        ...     year: NonNegativeInt
303        ...     manufacturer: str
304        ...     make: str
305        ...     fuel: Literal["gas", "diesel"]
306        ...     miles: NonNegativeInt
307        ...     title_status: Literal["clean", "rebuilt"]
308        ...     transmission: Literal["automatic", "manual"]
309        ...     drive: Literal["fwd", "rwd", "4wd"]
310        ...     type: str
311        ...     paint_color: str
312        >>> data = file("data/cars-1k.csv")
313        >>> casebase = validate(data, Car)
314        >>> data = polars(pl.read_csv("data/cars-1k.csv"))
315        >>> casebase = validate(data, Car)
316    """
317
318    return {key: model.model_validate(value) for key, value in casebase.items()}

Validates the casebase against a Pydantic model.

Arguments:
  • casebase: Casebase where the values are the data to validate.
  • model: Pydantic model to validate the data.
Examples:
>>> from pydantic import BaseModel, NonNegativeInt
>>> from typing import Literal
>>> class Car(BaseModel):
...     price: NonNegativeInt
...     year: NonNegativeInt
...     manufacturer: str
...     make: str
...     fuel: Literal["gas", "diesel"]
...     miles: NonNegativeInt
...     title_status: Literal["clean", "rebuilt"]
...     transmission: Literal["automatic", "manual"]
...     drive: Literal["fwd", "rwd", "4wd"]
...     type: str
...     paint_color: str
>>> data = file("data/cars-1k.csv")
>>> casebase = validate(data, Car)
>>> data = polars(pl.read_csv("data/cars-1k.csv"))
>>> casebase = validate(data, Car)
 96@dataclass(slots=True, frozen=True)
 97class csv(ConversionFunc[Iterable[str] | ReadableType, dict[int, dict[str, str]]]):
 98    """Reads a csv file and converts it into a dict representation"""
 99
100    def __call__(
101        self, source: Iterable[str] | ReadableType
102    ) -> dict[int, dict[str, str]]:
103        if isinstance(source, ReadableType):
104            source = read(source).splitlines()
105
106        reader = csvlib.DictReader(source)  # pyright: ignore
107        data: dict[int, dict[str, str]] = {}
108        row: dict[str, str]
109
110        for idx, row in enumerate(reader):
111            data[idx] = row
112
113        return data

Reads a csv file and converts it into a dict representation

116@dataclass(slots=True, frozen=True)
117class json(ConversionFunc[ReadableType, dict[Any, Any]]):
118    """Reads a json file and converts it into a dict representation"""
119
120    def __call__(self, source: ReadableType) -> dict[Any, Any]:
121        data = orjson.loads(read(source))
122
123        if isinstance(data, list):
124            return dict(enumerate(data))
125        elif isinstance(data, dict):
126            return data
127
128        raise TypeError(f"Invalid data type: {type(data)}")

Reads a json file and converts it into a dict representation

@dataclass(slots=True, frozen=True)
class polars(collections.abc.Mapping[int, dict[str, typing.Any]]):
@dataclass(slots=True, frozen=True)
class polars(Mapping[int, dict[str, Any]]):
    """Expose the rows of a polars DataFrame through the ``Mapping`` protocol.

    Keys are row positions; values are the rows as returned by
    ``DataFrame.row(..., named=True)``.
    """

    df: pl.DataFrame

    def __getitem__(self, key: int) -> dict[str, Any]:
        named_row = self.df.row(key, named=True)
        return named_row

    def __iter__(self) -> Iterator[int]:
        row_count, _ = self.df.shape
        return iter(range(row_count))

    def __len__(self) -> int:
        row_count, _ = self.df.shape
        return row_count

A wrapper around a polars DataFrame to provide a dict-like interface

polars(df: polars.dataframe.frame.DataFrame)
df: polars.dataframe.frame.DataFrame
@dataclass(slots=True, frozen=True)
class pandas(collections.abc.Mapping[int, pandas.core.series.Series]):
53@dataclass(slots=True, frozen=True)
54class pandas(Mapping[int, pd.Series]):
55    """A wrapper around a pandas DataFrame to provide a dict-like interface"""
56
57    df: pd.DataFrame
58
59    def __getitem__(self, key: int | str) -> pd.Series:
60        if isinstance(key, str):
61            return cast(pd.Series, self.df.loc[key])
62
63        return cast(pd.Series, self.df.iloc[key])
64
65    def __iter__(self) -> Iterator[int]:
66        return iter(range(self.df.shape[0]))
67
68    def __len__(self) -> int:
69        return self.df.shape[0]

A wrapper around a pandas DataFrame to provide a dict-like interface

pandas(df: pandas.core.frame.DataFrame)
df: pandas.core.frame.DataFrame
@dataclass(slots=True, frozen=True)
class py(cbrkit.typing.ConversionFunc[str, typing.Any]):
@dataclass(slots=True, frozen=True)
class py(ConversionFunc[str, Any]):
    """Loader that hands ``source`` to ``load_object`` and returns its result."""

    def __call__(self, source: str) -> Any:
        loaded: Any = load_object(source)
        return loaded

Reads a Python file and loads the object from it.

@dataclass(slots=True, frozen=True)
class toml(ConversionFunc[ReadableType, dict[str, Any]]):
    """Parse TOML content with ``rtoml`` and return the resulting dict."""

    def __call__(self, source: ReadableType) -> dict[str, Any]:
        text = read(source)
        return rtoml.loads(text)

Reads a toml file and converts it into a dict representation

@dataclass(slots=True, frozen=True)
class txt(cbrkit.typing.ConversionFunc[str | bytes | typing.TextIO | typing.BinaryIO, str]):
@dataclass(slots=True, frozen=True)
class txt(ConversionFunc[ReadableType, str]):
    """Return the content of ``source`` as a single string (see ``read``)."""

    def __call__(self, source: ReadableType) -> str:
        text: str = read(source)
        return text

Reads a text file and converts it into a string

@dataclass(slots=True, frozen=True)
class xml(ConversionFunc[ReadableType, dict[str, Any]]):
    """Parse an XML document with ``xmltodict``.

    When the parsed result has exactly one top-level key, the value stored
    under that key is returned instead of the wrapping dict.
    """

    def __call__(self, source: ReadableType) -> dict[str, Any]:
        parsed = xmltodict.parse(read(source))

        if len(parsed) != 1:
            return parsed

        # Exactly one top-level entry: unwrap its content.
        (root_content,) = parsed.values()
        return root_content

Reads an XML file and converts it into a dict representation

@dataclass(slots=True, frozen=True)
class yaml(ConversionFunc[ReadableType, dict[Any, Any]]):
    """Reads a YAML stream and merges all of its documents into one dict.

    Mapping documents are merged key by key, so later documents win on
    duplicate keys. Sequence documents contribute their items under
    consecutive integer keys that are numbered continuously across the
    whole stream.

    Raises:
        TypeError: If a document is neither a mapping nor a sequence.
    """

    def __call__(self, source: ReadableType) -> dict[Any, Any]:
        data: dict[Any, Any] = {}
        # Running key for sequence items. Keying by ``doc_idx + idx`` made the
        # items of consecutive sequence documents overwrite each other.
        next_key = 0

        for doc in yamllib.safe_load_all(source):
            if isinstance(doc, list):
                for item in doc:
                    data[next_key] = item
                    next_key += 1
            elif isinstance(doc, dict):
                data |= doc
            else:
                raise TypeError(f"Invalid document type: {type(doc)}")

        return data

Reads a yaml file and converts it into a dict representation