cbrkit.loaders
This module provides several loaders to read data from different file formats and convert it into a Casebase. To validate the data against a Pydantic model, a validate
function is also provided.
1""" 2This module provides several loaders to read data from different file formats and convert it into a Casebase. To validate the data against a Pydantic model, a `validate` function is also provided. 3""" 4 5import csv as csvlib 6from collections.abc import Callable, Iterable, Iterator, Mapping 7from dataclasses import dataclass 8from pathlib import Path 9from typing import Any, BinaryIO, TextIO, cast 10 11import orjson 12import pandas as pd 13import polars as pl 14import rtoml 15import xmltodict 16import yaml as yamllib 17from pydantic import BaseModel 18 19from .helpers import load_object 20from .typing import Casebase, ConversionFunc, FilePath 21 22__all__ = [ 23 "path", 24 "file", 25 "directory", 26 "validate", 27 "csv", 28 "json", 29 "polars", 30 "pandas", 31 "py", 32 "toml", 33 "txt", 34 "xml", 35 "yaml", 36] 37 38AnyIO = TextIO | BinaryIO 39ReadableType = str | bytes | TextIO | BinaryIO 40 41 42def read(data: ReadableType) -> str: 43 if isinstance(data, str): 44 return data 45 46 elif isinstance(data, bytes | bytearray): 47 return data.decode("utf-8") 48 49 return read(data.read()) # pyright: ignore 50 51 52@dataclass(slots=True, frozen=True) 53class pandas(Mapping[int, pd.Series]): 54 """A wrapper around a pandas DataFrame to provide a dict-like interface""" 55 56 df: pd.DataFrame 57 58 def __getitem__(self, key: int | str) -> pd.Series: 59 if isinstance(key, str): 60 return cast(pd.Series, self.df.loc[key]) 61 62 return cast(pd.Series, self.df.iloc[key]) 63 64 def __iter__(self) -> Iterator[int]: 65 return iter(range(self.df.shape[0])) 66 67 def __len__(self) -> int: 68 return self.df.shape[0] 69 70 71@dataclass(slots=True, frozen=True) 72class polars(Mapping[int, dict[str, Any]]): 73 """A wrapper around a polars DataFrame to provide a dict-like interface""" 74 75 df: pl.DataFrame 76 77 def __getitem__(self, key: int) -> dict[str, Any]: 78 return self.df.row(key, named=True) 79 80 def __iter__(self) -> Iterator[int]: 81 return 
iter(range(self.df.shape[0])) 82 83 def __len__(self) -> int: 84 return self.df.shape[0] 85 86 87@dataclass(slots=True, frozen=True) 88class py(ConversionFunc[str, Any]): 89 """Reads a Python file and loads the object from it.""" 90 91 def __call__(self, source: str) -> Any: 92 return load_object(source) 93 94 95@dataclass(slots=True, frozen=True) 96class csv(ConversionFunc[Iterable[str] | ReadableType, dict[int, dict[str, str]]]): 97 """Reads a csv file and converts it into a dict representation""" 98 99 def __call__( 100 self, source: Iterable[str] | ReadableType 101 ) -> dict[int, dict[str, str]]: 102 if isinstance(source, ReadableType): 103 source = read(source).splitlines() 104 105 reader = csvlib.DictReader(source) # pyright: ignore 106 data: dict[int, dict[str, str]] = {} 107 row: dict[str, str] 108 109 for idx, row in enumerate(reader): 110 data[idx] = row 111 112 return data 113 114 115@dataclass(slots=True, frozen=True) 116class json(ConversionFunc[ReadableType, dict[Any, Any]]): 117 """Reads a json file and converts it into a dict representation""" 118 119 def __call__(self, source: ReadableType) -> dict[Any, Any]: 120 data = orjson.loads(read(source)) 121 122 if isinstance(data, list): 123 return dict(enumerate(data)) 124 elif isinstance(data, dict): 125 return data 126 127 raise TypeError(f"Invalid data type: {type(data)}") 128 129 130@dataclass(slots=True, frozen=True) 131class toml(ConversionFunc[ReadableType, dict[str, Any]]): 132 """Reads a toml file and converts it into a dict representation""" 133 134 def __call__(self, source: ReadableType) -> dict[str, Any]: 135 return rtoml.loads(read(source)) 136 137 138@dataclass(slots=True, frozen=True) 139class yaml(ConversionFunc[ReadableType, dict[Any, Any]]): 140 """Reads a yaml file and converts it into a dict representation""" 141 142 def __call__(self, source: ReadableType) -> dict[Any, Any]: 143 data: dict[Any, Any] = {} 144 145 for doc_idx, doc in enumerate(yamllib.safe_load_all(source)): 146 if 
isinstance(doc, list): 147 for idx, item in enumerate(doc): 148 data[doc_idx + idx] = item 149 elif isinstance(doc, dict): 150 data |= doc 151 else: 152 raise TypeError(f"Invalid document type: {type(doc)}") 153 154 return data 155 156 157@dataclass(slots=True, frozen=True) 158class xml(ConversionFunc[ReadableType, dict[str, Any]]): 159 """Reads a xml file and converts it into a dict representation""" 160 161 def __call__(self, source: ReadableType) -> dict[str, Any]: 162 data = xmltodict.parse(read(source)) 163 164 if len(data) == 1: 165 data_without_root = data[next(iter(data))] 166 167 return data_without_root 168 169 return data 170 171 172@dataclass(slots=True, frozen=True) 173class txt(ConversionFunc[ReadableType, str]): 174 """Reads a text file and converts it into a string""" 175 176 def __call__(self, source: ReadableType) -> str: 177 return read(source) 178 179 180def _csv_polars(source: Path | ReadableType) -> Mapping[int, dict[str, Any]]: 181 return polars(pl.read_csv(source)) 182 183 184StructuredLoader = Callable[[AnyIO], Mapping[Any, Any]] 185AnyLoader = Callable[[AnyIO], Any] 186 187structured_loaders: dict[str, StructuredLoader] = { 188 ".json": json(), 189 ".toml": toml(), 190 ".yaml": yaml(), 191 ".yml": yaml(), 192 ".xml": xml(), 193 ".csv": _csv_polars, 194} 195 196any_loaders: dict[str, AnyLoader] = { 197 **structured_loaders, 198 ".txt": txt(), 199} 200 201 202def path( 203 path: FilePath, pattern: str | None = None, loader: AnyLoader | None = None 204) -> Casebase[Any, Any]: 205 """Converts a path into a Casebase. The path can be a directory or a file. 206 207 Args: 208 path: Path of the file. 209 210 Returns: 211 Returns a Casebase. 
212 213 Examples: 214 >>> file_path = "./data/cars-1k.csv" 215 >>> result = path(file_path) 216 """ 217 if isinstance(path, str): 218 path = Path(path) 219 220 if path.is_file(): 221 return file(path, loader) 222 elif path.is_dir(): 223 return directory(path, pattern) 224 225 raise FileNotFoundError(path) 226 227 228def file(path: FilePath, loader: StructuredLoader | None = None) -> Casebase[Any, Any]: 229 """Converts a file into a Casebase. The file can be of type csv, json, toml, yaml, or yml. 230 231 Args: 232 path: Path of the file. 233 234 Returns: 235 Returns a Casebase. 236 237 Examples: 238 >>> from pathlib import Path 239 >>> file_path = Path("./data/cars-1k.csv") 240 >>> result = file(file_path) 241 242 """ 243 if isinstance(path, str): 244 path = Path(path) 245 246 if loader is None and path.suffix not in structured_loaders: 247 raise ValueError(f"Unsupported file type: {path.suffix}") 248 249 if loader is None: 250 loader = structured_loaders[path.suffix] 251 252 with path.open("rb") as fp: 253 return loader(fp) 254 255 256def directory(path: FilePath, pattern: str | None = None) -> Casebase[Any, Any]: 257 """Converts the files of a directory into a Casebase. The files can be of type txt, csv, json, toml, yaml, or yml. 258 259 Args: 260 path: Path of the directory. 261 pattern: Relative pattern for the files. 262 263 Returns: 264 Returns a Casebase. 
265 266 Examples: 267 >>> from pathlib import Path 268 >>> directory_path = Path("./data") 269 >>> result = directory(directory_path, "*.csv") 270 >>> assert result is not None 271 """ 272 cb: Casebase[Any, Any] = {} 273 274 if isinstance(path, str): 275 path = Path(path) 276 277 for elem in path.glob(pattern or "*"): 278 if elem.is_file() and elem.suffix in any_loaders: 279 loader = any_loaders[elem.suffix] 280 281 with elem.open("rb") as fp: 282 cb[elem.stem] = loader(fp) 283 284 return cb 285 286 287def validate[K, V: BaseModel]( 288 casebase: Casebase[K, Any], model: type[V] 289) -> Casebase[K, V]: 290 """Validates the casebase against a Pydantic model. 291 292 Args: 293 casebase: Casebase where the values are the data to validate. 294 model: Pydantic model to validate the data. 295 296 Examples: 297 >>> from pydantic import BaseModel, NonNegativeInt 298 >>> from typing import Literal 299 >>> class Car(BaseModel): 300 ... price: NonNegativeInt 301 ... year: NonNegativeInt 302 ... manufacturer: str 303 ... make: str 304 ... fuel: Literal["gas", "diesel"] 305 ... miles: NonNegativeInt 306 ... title_status: Literal["clean", "rebuilt"] 307 ... transmission: Literal["automatic", "manual"] 308 ... drive: Literal["fwd", "rwd", "4wd"] 309 ... type: str 310 ... paint_color: str 311 >>> data = file("data/cars-1k.csv") 312 >>> casebase = validate(data, Car) 313 >>> data = polars(pl.read_csv("data/cars-1k.csv")) 314 >>> casebase = validate(data, Car) 315 """ 316 317 return {key: model.model_validate(value) for key, value in casebase.items()}
def path(
    path: FilePath, pattern: str | None = None, loader: AnyLoader | None = None
) -> Casebase[Any, Any]:
    """Load a Casebase from a filesystem path that is either a file or a directory.

    Args:
        path: Location of the file or directory.
        pattern: Glob pattern handed to `directory`; only used for directories.
        loader: Loader handed to `file`; only used for files.

    Returns:
        The Casebase produced by `file` or `directory`.

    Raises:
        FileNotFoundError: If `path` is neither a file nor a directory.

    Examples:
        >>> file_path = "./data/cars-1k.csv"
        >>> result = path(file_path)
    """
    target = Path(path) if isinstance(path, str) else path

    if target.is_file():
        return file(target, loader)

    if target.is_dir():
        return directory(target, pattern)

    raise FileNotFoundError(target)
Converts a path into a Casebase. The path can be a directory or a file.
Arguments:
- path: Path of the file or directory.
Returns:
Returns a Casebase.
Examples:
>>> file_path = "./data/cars-1k.csv"
>>> result = path(file_path)
def file(path: FilePath, loader: StructuredLoader | None = None) -> Casebase[Any, Any]:
    """Load a single file as a Casebase.

    Without an explicit `loader`, one is picked from `structured_loaders` by
    the file suffix (csv, json, toml, xml, yaml, or yml).

    Args:
        path: Path of the file.
        loader: Loader to use instead of the suffix-based default.

    Returns:
        Whatever the loader returns for the file opened in binary mode.

    Raises:
        ValueError: If no loader is given and the suffix is not supported.

    Examples:
        >>> from pathlib import Path
        >>> file_path = Path("./data/cars-1k.csv")
        >>> result = file(file_path)

    """
    if isinstance(path, str):
        path = Path(path)

    if loader is None:
        # Fall back to the loader registered for this suffix, if there is one.
        if path.suffix not in structured_loaders:
            raise ValueError(f"Unsupported file type: {path.suffix}")

        loader = structured_loaders[path.suffix]

    with path.open("rb") as stream:
        return loader(stream)
Converts a file into a Casebase. The file can be of type csv, json, toml, xml, yaml, or yml.
Arguments:
- path: Path of the file.
Returns:
Returns a Casebase.
Examples:
>>> from pathlib import Path
>>> file_path = Path("./data/cars-1k.csv")
>>> result = file(file_path)
def directory(path: FilePath, pattern: str | None = None) -> Casebase[Any, Any]:
    """Load every supported file of a directory into one Casebase.

    Each file whose suffix has an entry in `any_loaders` is loaded and stored
    under its file name without the suffix. Other entries are skipped.

    Args:
        path: Path of the directory.
        pattern: Relative glob pattern for the files; defaults to "*".

    Returns:
        A Casebase mapping file stems to the loaded contents.

    Examples:
        >>> from pathlib import Path
        >>> directory_path = Path("./data")
        >>> result = directory(directory_path, "*.csv")
        >>> assert result is not None
    """
    root = Path(path) if isinstance(path, str) else path
    cases: Casebase[Any, Any] = {}

    for entry in root.glob(pattern or "*"):
        # Skip sub-directories and files no loader is registered for.
        if not entry.is_file() or entry.suffix not in any_loaders:
            continue

        load = any_loaders[entry.suffix]

        with entry.open("rb") as stream:
            cases[entry.stem] = load(stream)

    return cases
Converts the files of a directory into a Casebase. The files can be of type txt, csv, json, toml, xml, yaml, or yml.
Arguments:
- path: Path of the directory.
- pattern: Relative pattern for the files.
Returns:
Returns a Casebase.
Examples:
>>> from pathlib import Path
>>> directory_path = Path("./data")
>>> result = directory(directory_path, "*.csv")
>>> assert result is not None
288def validate[K, V: BaseModel]( 289 casebase: Casebase[K, Any], model: type[V] 290) -> Casebase[K, V]: 291 """Validates the casebase against a Pydantic model. 292 293 Args: 294 casebase: Casebase where the values are the data to validate. 295 model: Pydantic model to validate the data. 296 297 Examples: 298 >>> from pydantic import BaseModel, NonNegativeInt 299 >>> from typing import Literal 300 >>> class Car(BaseModel): 301 ... price: NonNegativeInt 302 ... year: NonNegativeInt 303 ... manufacturer: str 304 ... make: str 305 ... fuel: Literal["gas", "diesel"] 306 ... miles: NonNegativeInt 307 ... title_status: Literal["clean", "rebuilt"] 308 ... transmission: Literal["automatic", "manual"] 309 ... drive: Literal["fwd", "rwd", "4wd"] 310 ... type: str 311 ... paint_color: str 312 >>> data = file("data/cars-1k.csv") 313 >>> casebase = validate(data, Car) 314 >>> data = polars(pl.read_csv("data/cars-1k.csv")) 315 >>> casebase = validate(data, Car) 316 """ 317 318 return {key: model.model_validate(value) for key, value in casebase.items()}
Validates the casebase against a Pydantic model.
Arguments:
- casebase: Casebase where the values are the data to validate.
- model: Pydantic model to validate the data.
Examples:
>>> from pydantic import BaseModel, NonNegativeInt
>>> from typing import Literal
>>> class Car(BaseModel):
...     price: NonNegativeInt
...     year: NonNegativeInt
...     manufacturer: str
...     make: str
...     fuel: Literal["gas", "diesel"]
...     miles: NonNegativeInt
...     title_status: Literal["clean", "rebuilt"]
...     transmission: Literal["automatic", "manual"]
...     drive: Literal["fwd", "rwd", "4wd"]
...     type: str
...     paint_color: str
>>> data = file("data/cars-1k.csv")
>>> casebase = validate(data, Car)
>>> data = polars(pl.read_csv("data/cars-1k.csv"))
>>> casebase = validate(data, Car)
@dataclass(slots=True, frozen=True)
class csv(ConversionFunc[Iterable[str] | ReadableType, dict[int, dict[str, str]]]):
    """Reads a csv file and converts it into a dict representation"""

    def __call__(
        self, source: Iterable[str] | ReadableType
    ) -> dict[int, dict[str, str]]:
        """Parse `source` and return its rows keyed by their zero-based position.

        Args:
            source: CSV content as a string, bytes, an open text or binary
                stream, or an iterable of text lines.

        Returns:
            A dict mapping each row index to the row produced by
            `csv.DictReader`.
        """
        import io  # local import: only this loader needs it

        # Real file objects are not instances of typing.TextIO/BinaryIO, so an
        # isinstance check against ReadableType misses them (and binary handles
        # would reach the reader as bytes). Detect streams by `read` instead.
        if isinstance(source, str | bytes | bytearray) or hasattr(source, "read"):
            # newline="" keeps line breaks inside quoted fields intact, which
            # a plain splitlines() would drop.
            source = io.StringIO(read(source), newline="")  # pyright: ignore

        reader = csvlib.DictReader(source)  # pyright: ignore

        return dict(enumerate(reader))
Reads a csv file and converts it into a dict representation
@dataclass(slots=True, frozen=True)
class json(ConversionFunc[ReadableType, dict[Any, Any]]):
    """Loader that turns JSON content into a dict."""

    def __call__(self, source: ReadableType) -> dict[Any, Any]:
        """Parse `source` as JSON.

        A top-level object is returned as is; a top-level array is keyed by
        item position.

        Raises:
            TypeError: If the top-level value is neither an array nor an object.
        """
        data = orjson.loads(read(source))

        if isinstance(data, dict):
            return data

        if isinstance(data, list):
            return {position: item for position, item in enumerate(data)}

        raise TypeError(f"Invalid data type: {type(data)}")
Reads a json file and converts it into a dict representation
@dataclass(slots=True, frozen=True)
class polars(Mapping[int, dict[str, Any]]):
    """Read-only mapping view over a polars DataFrame, keyed by row position."""

    df: pl.DataFrame

    def __len__(self) -> int:
        """Return the number of rows of the wrapped DataFrame."""
        row_count, _ = self.df.shape
        return row_count

    def __iter__(self) -> Iterator[int]:
        """Iterate over the row positions 0 .. len - 1."""
        return iter(range(len(self)))

    def __getitem__(self, key: int) -> dict[str, Any]:
        """Return the row at position `key` as a dict."""
        row = self.df.row(key, named=True)
        return row
A wrapper around a polars DataFrame to provide a dict-like interface
53@dataclass(slots=True, frozen=True) 54class pandas(Mapping[int, pd.Series]): 55 """A wrapper around a pandas DataFrame to provide a dict-like interface""" 56 57 df: pd.DataFrame 58 59 def __getitem__(self, key: int | str) -> pd.Series: 60 if isinstance(key, str): 61 return cast(pd.Series, self.df.loc[key]) 62 63 return cast(pd.Series, self.df.iloc[key]) 64 65 def __iter__(self) -> Iterator[int]: 66 return iter(range(self.df.shape[0])) 67 68 def __len__(self) -> int: 69 return self.df.shape[0]
A wrapper around a pandas DataFrame to provide a dict-like interface
@dataclass(slots=True, frozen=True)
class py(ConversionFunc[str, Any]):
    """Loader that resolves a Python object by delegating to `load_object`."""

    def __call__(self, source: str) -> Any:
        """Return the object that `load_object` produces for `source`."""
        loaded = load_object(source)
        return loaded
Reads a Python file and loads the object from it.
@dataclass(slots=True, frozen=True)
class toml(ConversionFunc[ReadableType, dict[str, Any]]):
    """Loader that turns TOML content into a dict."""

    def __call__(self, source: ReadableType) -> dict[str, Any]:
        """Read `source` as text and parse it with `rtoml`."""
        text = read(source)
        return rtoml.loads(text)
Reads a toml file and converts it into a dict representation
@dataclass(slots=True, frozen=True)
class txt(ConversionFunc[ReadableType, str]):
    """Loader that returns the content of a text source as a string."""

    def __call__(self, source: ReadableType) -> str:
        """Return the textual content of `source` as produced by `read`."""
        text = read(source)
        return text
Reads a text file and converts it into a string
@dataclass(slots=True, frozen=True)
class xml(ConversionFunc[ReadableType, dict[str, Any]]):
    """Loader that turns XML content into a dict."""

    def __call__(self, source: ReadableType) -> dict[str, Any]:
        """Parse `source` with `xmltodict`.

        When the parsed result has exactly one top-level entry, the value of
        that entry is returned instead of the wrapping dict.
        """
        parsed = xmltodict.parse(read(source))

        if len(parsed) != 1:
            return parsed

        # Exactly one entry: unwrap it and return only its value.
        (root_content,) = parsed.values()
        return root_content
Reads an XML file and converts it into a dict representation
@dataclass(slots=True, frozen=True)
class yaml(ConversionFunc[ReadableType, dict[Any, Any]]):
    """Reads a yaml file and converts it into a dict representation"""

    def __call__(self, source: ReadableType) -> dict[Any, Any]:
        """Merge every document of a YAML stream into a single dict.

        Dict documents are merged key by key (later keys win). Items of list
        documents get consecutive integer keys that continue across
        documents, so items of different list documents never overwrite
        each other.

        Args:
            source: YAML content handed to `yaml.safe_load_all`.

        Raises:
            TypeError: If a document is neither a list nor a dict.
        """
        data: dict[Any, Any] = {}
        # Running key for list items. Offsetting by the document index, as
        # before, made keys collide across multi-document streams.
        next_idx = 0

        for doc in yamllib.safe_load_all(source):
            if isinstance(doc, list):
                data.update(enumerate(doc, start=next_idx))
                next_idx += len(doc)
            elif isinstance(doc, dict):
                data |= doc
            else:
                raise TypeError(f"Invalid document type: {type(doc)}")

        return data
Reads a yaml file and converts it into a dict representation