|
6 | 6 |
|
7 | 7 | import duckdb as ddb |
8 | 8 | import polars as pl |
9 | | -from duckdb import DuckDBPyConnection, DuckDBPyRelation, default_connection, read_csv |
| 9 | +from duckdb import DuckDBPyConnection, DuckDBPyRelation, StarExpression, default_connection, read_csv |
10 | 10 | from pydantic import BaseModel |
11 | 11 |
|
12 | 12 | from dve.core_engine.backends.base.reader import BaseFileReader, read_function |
13 | 13 | from dve.core_engine.backends.exceptions import EmptyFileError, MessageBearingError |
14 | 14 | from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import ( |
| 15 | + duckdb_record_index, |
15 | 16 | duckdb_write_parquet, |
16 | 17 | get_duckdb_type_from_annotation, |
17 | 18 | ) |
18 | 19 | from dve.core_engine.backends.implementations.duckdb.types import SQLType |
19 | 20 | from dve.core_engine.backends.readers.utilities import check_csv_header_expected |
20 | | -from dve.core_engine.backends.utilities import get_polars_type_from_annotation |
| 21 | +from dve.core_engine.backends.utilities import get_polars_type_from_annotation, polars_record_index |
| 22 | +from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME |
21 | 23 | from dve.core_engine.message import FeedbackMessage |
22 | 24 | from dve.core_engine.type_hints import URI, EntityName |
23 | 25 | from dve.parser.file_handling import get_content_length |
24 | 26 |
|
25 | | - |
| 27 | +@duckdb_record_index |
26 | 28 | @duckdb_write_parquet |
27 | 29 | class DuckDBCSVReader(BaseFileReader): |
28 | 30 | """A reader for CSV files including the ability to compare the passed model |
@@ -109,9 +111,9 @@ def read_to_relation( # pylint: disable=unused-argument |
109 | 111 | } |
110 | 112 |
|
111 | 113 | reader_options["columns"] = ddb_schema |
112 | | - return read_csv(resource, **reader_options) |
113 | | - |
| 114 | + return self.add_record_index(read_csv(resource, **reader_options, parallel=False)) |
114 | 115 |
|
| 116 | +@polars_record_index |
115 | 117 | class PolarsToDuckDBCSVReader(DuckDBCSVReader): |
116 | 118 | """ |
117 | 119 | Utilises the polars lazy csv reader which is then converted into a DuckDBPyRelation object. |
@@ -142,10 +144,11 @@ def read_to_relation( # pylint: disable=unused-argument |
142 | 144 | for fld in schema.__fields__.values() |
143 | 145 | } |
144 | 146 | reader_options["dtypes"] = polars_types |
| 147 | + |
145 | 148 |
|
146 | 149 | # there is a raise_if_empty arg for 0.18+. Future reference when upgrading. Makes L85 |
147 | 150 | # redundant |
148 | | - df = pl.scan_csv(resource, **reader_options).select(list(polars_types.keys())) # type: ignore # pylint: disable=W0612 |
| 151 | + df = self.add_record_index(pl.scan_csv(resource, **reader_options).select(list(polars_types.keys()))) # type: ignore # pylint: disable=W0612 |
149 | 152 |
|
150 | 153 | return ddb.sql("SELECT * FROM df") |
151 | 154 |
|
@@ -189,8 +192,8 @@ def __init__( |
189 | 192 | def read_to_relation( # pylint: disable=unused-argument |
190 | 193 | self, resource: URI, entity_name: EntityName, schema: type[BaseModel] |
191 | 194 | ) -> DuckDBPyRelation: |
192 | | - entity = super().read_to_relation(resource=resource, entity_name=entity_name, schema=schema) |
193 | | - entity = entity.distinct() |
| 195 | + entity: DuckDBPyRelation = super().read_to_relation(resource=resource, entity_name=entity_name, schema=schema) |
| 196 | + entity = entity.select(StarExpression(exclude=[RECORD_INDEX_COLUMN_NAME])).distinct() |
194 | 197 | no_records = entity.shape[0] |
195 | 198 |
|
196 | 199 | if no_records != 1: |
@@ -219,4 +222,4 @@ def read_to_relation( # pylint: disable=unused-argument |
219 | 222 | ], |
220 | 223 | ) |
221 | 224 |
|
222 | | - return entity |
| 225 | + return entity.select(f"*, 1 as {RECORD_INDEX_COLUMN_NAME}") |
0 commit comments