diff --git a/Cargo.lock b/Cargo.lock index a6ffb82..d796aee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -351,6 +351,25 @@ dependencies = [ "simd-adler32", ] +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -400,6 +419,8 @@ checksum = "91fd8e38a3b50ed1167fb981cd6fd60147e091784c427b8f7183a7ee32c31c12" dependencies = [ "chrono", "libc", + "num-bigint", + "num-traits", "once_cell", "portable-atomic", "pyo3-build-config", @@ -467,6 +488,8 @@ version = "0.6.2" dependencies = [ "calamine", "chrono", + "num-bigint", + "num-traits", "pyo3", "pyo3-build-config", "pyo3-file", diff --git a/Cargo.toml b/Cargo.toml index 6f70163..2012275 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,9 +13,12 @@ pyo3 = { version = "0.28.2", features = [ "extension-module", "chrono", "generate-import-lib", + "num-bigint", ] } chrono = { version = "0.4.42", features = ["serde"] } pyo3-file = { git = "https://github.com/dimastbk/pyo3-file.git", rev = "b0a5b260d13c6df14ea021b06c071a6f389fa805" } +num-bigint = "0.4.6" +num-traits = "0.2.19" [build-dependencies] pyo3-build-config = "0.28.2" diff --git a/python/python_calamine/_python_calamine.pyi b/python/python_calamine/_python_calamine.pyi index 536c989..f5ef11c 100644 --- a/python/python_calamine/_python_calamine.pyi +++ b/python/python_calamine/_python_calamine.pyi @@ -74,6 +74,28 @@ class CalamineSheet: def end(self) -> tuple[int, int] | None: """Get bottom right cell position of a sheet data.""" + def to_python_pandas( + self, skip_empty_area: bool = True, 
nrows: int | None = None
+    ) -> list[
+        list[
+            int
+            | float
+            | str
+            | bool
+            | datetime.time
+            | datetime.date
+            | datetime.datetime
+            | datetime.timedelta
+        ]
+    ]:
+        """Return sheet data as a list of lists with pandas-style coercion.
+
+        Args:
+            skip_empty_area (bool): By default, calamine skips empty rows/cols
+                before data. To suppress this, set `skip_empty_area` to `False`.
+            nrows (int | None): Maximum number of rows to return.
+        """
+
     def to_python(
         self, skip_empty_area: bool = True, nrows: int | None = None
     ) -> list[
diff --git a/src/types/cell.rs b/src/types/cell.rs
index 871ff93..42f0eb9 100644
--- a/src/types/cell.rs
+++ b/src/types/cell.rs
@@ -1,7 +1,9 @@
 use std::convert::From;
 
-use calamine::DataType;
+use calamine::{Data, DataType};
 use chrono::Datelike;
+use num_bigint::BigInt;
+use num_traits::FromPrimitive;
 use pyo3::prelude::*;
 
 /// https://docs.python.org/3/library/datetime.html#constants
@@ -13,6 +15,7 @@ const MAXYEAR: i32 = 9999;
 
 #[derive(Debug, Clone)]
 pub enum CellValue {
+    BigInt(BigInt),
     Int(i64),
     Float(f64),
     String(String),
@@ -32,6 +35,54 @@ fn check_year_range(value: DT) -> Option
{
     }
 }
 
+/// Convert a calamine cell to the value pandas' calamine engine expects:
+/// integral floats become ints, bare dates become midnight datetimes, and
+/// unconvertible date/time values fall back to the raw float or string.
+pub fn convert_to_pandas_cell(data: &Data) -> CellValue {
+    match data {
+        // GH#54564: pandas casts x.0 floats to x int.
+        // `is_finite` is already false for NaN, so no separate NaN check is needed.
+        Data::Float(f) if f.is_finite() && f.fract() == 0. => {
+            if *f >= i64::MIN as f64 && *f < i64::MAX as f64 {
+                CellValue::Int(*f as i64)
+            } else {
+                BigInt::from_f64(*f).map_or_else(|| data.into(), CellValue::BigInt)
+            }
+        }
+        // Return timedeltas and datetimes as-is to match openpyxl behavior (GH#59186)
+        Data::DateTime(dt) => {
+            let v = dt.as_f64();
+            if dt.is_duration() {
+                data.as_duration().map(CellValue::Timedelta)
+            } else if v < 1.0 {
+                data.as_time().map(CellValue::Time)
+            } else {
+                data.as_datetime()
+                    .and_then(check_year_range)
+                    .map(CellValue::DateTime)
+            }
+            .unwrap_or(CellValue::Float(v))
+        }
+        Data::DateTimeIso(v) => {
+            if v.contains('T') {
+                data.as_datetime()
+                    .and_then(check_year_range)
+                    .map(CellValue::DateTime)
+            } else if v.contains(':') {
+                data.as_time().map(CellValue::Time)
+            } else {
+                data.as_date()
+                    .and_then(check_year_range)
+                    .and_then(|date| date.and_hms_opt(0, 0, 0))
+                    .map(CellValue::DateTime)
+            }
+        }
+        // Lazy fallback: the string is only copied when parsing failed.
+        .unwrap_or_else(|| CellValue::String(v.to_owned())),
+        _ => data.into(),
+    }
+}
+
 impl<'py> IntoPyObject<'py> for CellValue {
     type Target = PyAny;
     type Output = Bound<'py, Self::Target>;
@@ -39,6 +90,7 @@ impl<'py> IntoPyObject<'py> for CellValue {
 
     fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
         match self {
+            CellValue::BigInt(v) => Ok(v.into_pyobject(py)?.into_any()),
             CellValue::Int(v) => Ok(v.into_pyobject(py)?.into_any()),
             CellValue::Float(v) => Ok(v.into_pyobject(py)?.into_any()),
             CellValue::String(v) => Ok(v.into_pyobject(py)?.into_any()),
diff --git a/src/types/sheet.rs b/src/types/sheet.rs
index dd237dd..fdc86bb 100644
--- a/src/types/sheet.rs
+++ b/src/types/sheet.rs
@@ -6,6 +6,7 @@ use pyo3::class::basic::CompareOp;
 use pyo3::prelude::*;
 use pyo3::types::PyList;
 
+use crate::types::cell::convert_to_pandas_cell;
 use crate::CellValue;
 
 #[pyclass(eq, eq_int,
from_py_object)]
@@ -187,6 +188,47 @@ impl CalamineSheet {
         self.range.end()
     }
 
+    /// Return sheet data as a list of rows with pandas-style cell coercion.
+    ///
+    /// Every cell is converted with `convert_to_pandas_cell`.
+    ///
+    /// * `skip_empty_area` - when false and the data does not start at (0, 0),
+    ///   the range is rebuilt starting from (0, 0) instead of the first used cell.
+    /// * `nrows` - maximum number of rows to return; `None` means the index of
+    ///   the last data row plus one.
+    #[pyo3(signature = (skip_empty_area=true, nrows=None))]
+    fn to_python_pandas(
+        slf: PyRef<'_, Self>,
+        skip_empty_area: bool,
+        nrows: Option<u32>,
+    ) -> PyResult<Bound<'_, PyList>> {
+        let nrows = match nrows {
+            Some(nrows) => nrows,
+            None => slf.range.end().map_or(0, |end| end.0 + 1),
+        };
+
+        let range = if skip_empty_area || Some((0, 0)) == slf.range.start() {
+            Arc::clone(&slf.range)
+        } else if let Some(end) = slf.range.end() {
+            // Last row index to materialise: `nrows - 1`, capped at the last data
+            // row. `saturating_sub` keeps `nrows == 0` from underflowing the `u32`.
+            let last_row = nrows.saturating_sub(1).min(end.0);
+            Arc::new(slf.range.range((0, 0), (last_row, end.1)))
+        } else {
+            Arc::clone(&slf.range)
+        };
+
+        let py_list = PyList::empty(slf.py());
+
+        for row in range.rows().take(nrows as usize) {
+            let py_row = PyList::new(slf.py(), row.iter().map(convert_to_pandas_cell))?;
+
+            py_list.append(py_row)?;
+        }
+
+        Ok(py_list)
+    }
+
     #[pyo3(signature = (skip_empty_area=true, nrows=None))]
     fn to_python(
         slf: PyRef<'_, Self>,
diff --git a/tests/data/large_integer_pandas.ods b/tests/data/large_integer_pandas.ods
new file mode 100644
index 0000000..40c7953
Binary files /dev/null and b/tests/data/large_integer_pandas.ods differ
diff --git a/tests/data/large_integer_pandas.xlsx b/tests/data/large_integer_pandas.xlsx
new file mode 100644
index 0000000..028b0a4
Binary files /dev/null and b/tests/data/large_integer_pandas.xlsx differ
diff --git a/tests/test_pandas_bypass.py b/tests/test_pandas_bypass.py
new file mode 100644
index 0000000..7170e56
--- /dev/null
+++ b/tests/test_pandas_bypass.py
@@ -0,0 +1,94 @@
+from datetime import date, datetime, time, timedelta
+from pathlib import Path
+
+from python_calamine import CalamineWorkbook
+
+PATH = Path(__file__).parent / "data"
+
+
+def __old_convert_cell(value):
+    if isinstance(value, float):
+        # GH#54564 - is_integer() returns False for NaN/Inf,
+        # so this safely avoids int() on non-finite values
+        if value.is_integer():
+            return int(value)
+        return value
+    elif isinstance(value, (datetime,
timedelta)):
+        # Return as-is to match openpyxl behavior (GH#59186)
+        return value
+    elif isinstance(value, date):
+        # Convert date to datetime to match openpyxl behavior (GH#59186)
+        return datetime(value.year, value.month, value.day)
+    elif isinstance(value, time):
+        return value
+    return value
+
+
+def test_old_pandas_ods_large_integer():
+    """Large integers in an .ods sheet match the legacy pandas-side conversion."""
+    workbook = CalamineWorkbook.from_object(PATH / "large_integer_pandas.ods")
+    sheet = workbook.get_sheet_by_index(0)
+
+    want = [[__old_convert_cell(v) for v in row] for row in sheet.to_python()]
+    got = sheet.to_python_pandas()
+
+    assert want == got
+
+
+def test_old_pandas_xlsx_large_integer():
+    """Large integers in an .xlsx sheet match the legacy pandas-side conversion."""
+    workbook = CalamineWorkbook.from_object(PATH / "large_integer_pandas.xlsx")
+    sheet = workbook.get_sheet_by_index(0)
+
+    want = [[__old_convert_cell(v) for v in row] for row in sheet.to_python()]
+    got = sheet.to_python_pandas()
+
+    assert want == got
+
+
+def test_old_pandas_xlsx():
+    """Every checked sheet of base.xlsx matches the legacy conversion."""
+    workbook = CalamineWorkbook.from_object(PATH / "base.xlsx")
+
+    for name in ("Sheet1", "Sheet2", "Merged Cells"):
+        sheet = workbook.get_sheet_by_name(name)
+        want = [[__old_convert_cell(v) for v in row] for row in sheet.to_python()]
+        got = sheet.to_python_pandas()
+
+        assert want == got
+
+
+def test_old_pandas_xls():
+    """Every checked sheet of base.xls matches the legacy conversion."""
+    workbook = CalamineWorkbook.from_object(PATH / "base.xls")
+
+    for name in ("Sheet1", "Sheet2", "Merged Cells"):
+        sheet = workbook.get_sheet_by_name(name)
+        want = [[__old_convert_cell(v) for v in row] for row in sheet.to_python()]
+        got = sheet.to_python_pandas()
+
+        assert want == got
+
+
+def test_old_pandas_xlsb():
+    """Every checked sheet of base.xlsb matches the legacy conversion."""
+    workbook = CalamineWorkbook.from_object(PATH / "base.xlsb")
+
+    for name in ("Sheet1", "Sheet2", "Merged Cells"):
+        sheet = workbook.get_sheet_by_name(name)
+        want = [[__old_convert_cell(v) for v in row] for row in sheet.to_python()]
+        got = sheet.to_python_pandas()
+
+        assert want == got
+
+
+def test_old_pandas_ods():
+    """Every checked sheet of base.ods matches the legacy conversion."""
+    workbook = CalamineWorkbook.from_object(PATH / "base.ods")
+
+    for name in ("Sheet1", "Sheet2", "Merged Cells"):
+        sheet = workbook.get_sheet_by_name(name)
+        want = [[__old_convert_cell(v) for v in row] for row in sheet.to_python()]
+        got = sheet.to_python_pandas()
+
+        assert want == got