diff --git a/CHANGES.md b/CHANGES.md
index 9e786442..3823f20b 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,9 +2,14 @@

 ## 0.12.0 (yyyy-mm-dd)

+### Improvements
+
+- Add `datetimes` parameter to `read_dataframe` to choose how datetime columns
+  are returned, plus several fixes when reading and writing datetimes (#486).
+
 ### Bug fixes

-- Fix wrong layername when creating .gpkg.zip file (#570)
+- Fix wrong layername when creating .gpkg.zip file (#570).

 ## 0.11.1 (2025-08-02)
diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx
index d9d02024..a6434523 100644
--- a/pyogrio/_io.pyx
+++ b/pyogrio/_io.pyx
@@ -937,10 +937,16 @@ cdef process_fields(

             if datetime_as_string:
                 # defer datetime parsing to user/ pandas layer
-                # Update to OGR_F_GetFieldAsISO8601DateTime when GDAL 3.7+ only
-                data[i] = get_string(
-                    OGR_F_GetFieldAsString(ogr_feature, field_index), encoding=encoding
-                )
+                IF CTE_GDAL_VERSION >= (3, 7, 0):
+                    data[i] = get_string(
+                        OGR_F_GetFieldAsISO8601DateTime(ogr_feature, field_index, NULL),
+                        encoding=encoding,
+                    )
+                ELSE:
+                    data[i] = get_string(
+                        OGR_F_GetFieldAsString(ogr_feature, field_index),
+                        encoding=encoding,
+                    )
             else:
                 success = OGR_F_GetFieldAsDateTimeEx(
                     ogr_feature,
@@ -1503,6 +1509,7 @@ def ogr_open_arrow(
     int return_fids=False,
     int batch_size=0,
     use_pyarrow=False,
+    datetime_as_string=False,
 ):

     cdef int err = 0
@@ -1723,6 +1730,12 @@ def ogr_open_arrow(
                 "GEOARROW".encode("UTF-8")
             )

+    # Read DateTime fields as strings, as the Arrow DateTime column type has
+    # only limited support for e.g. mixed timezones.
+    IF CTE_GDAL_VERSION >= (3, 11, 0):
+        if datetime_as_string:
+            options = CSLSetNameValue(options, "DATETIME_AS_STRING", "YES")
+
     # make sure layer is read from beginning
     OGR_L_ResetReading(ogr_layer)

@@ -1750,6 +1763,7 @@ def ogr_open_arrow(
         "crs": crs,
         "encoding": encoding,
         "fields": fields[:, 2],
+        "dtypes": fields[:, 3],
         "geometry_type": geometry_type,
         "geometry_name": geometry_name,
         "fid_column": fid_column,
diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd
index ca400f6a..4d07b2f4 100644
--- a/pyogrio/_ogr.pxd
+++ b/pyogrio/_ogr.pxd
@@ -415,6 +415,14 @@ IF CTE_GDAL_VERSION >= (3, 6, 0):
     )


+IF CTE_GDAL_VERSION >= (3, 7, 0):
+
+    cdef extern from "ogr_api.h":
+        const char* OGR_F_GetFieldAsISO8601DateTime(
+            OGRFeatureH feature, int n, char** papszOptions
+        )
+
+
 IF CTE_GDAL_VERSION >= (3, 8, 0):

     cdef extern from "ogr_api.h":
diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py
index ce57575b..17eece5f 100644
--- a/pyogrio/geopandas.py
+++ b/pyogrio/geopandas.py
@@ -2,6 +2,7 @@
 import os
 import warnings
+from datetime import datetime

 import numpy as np

@@ -12,6 +13,7 @@
     PANDAS_GE_22,
     PANDAS_GE_30,
     PYARROW_GE_19,
+    __gdal_version__,
 )
 from pyogrio.errors import DataSourceError
 from pyogrio.raw import (
@@ -37,33 +39,98 @@ def _stringify_path(path):
     return path


-def _try_parse_datetime(ser):
+def _try_parse_datetime(ser, datetimes):
     import pandas as pd  # only called when pandas is known to be installed
+    from pandas.api.types import is_string_dtype
+
+    datetimes = datetimes.upper()
+    datetimes_values = [
+        "MIXED_TO_UTC",
+        "MIXED_TO_DATETIME",
+        "STRING",
+    ]
+    datetime_kwargs = {}
+    if datetimes == "STRING":
+        if not is_string_dtype(ser.dtype):
+            # Returning datetimes as strings via arrow is only available for
+            # GDAL >= 3.11, so convert to string here if needed.
+            res = ser.astype("string").str.replace(" ", "T")
+            return res
+        if __gdal_version__ < (3, 7, 0):
+            # GDAL < 3.7 doesn't return datetimes in ISO8601 format, so fix that
+            return ser.str.replace(" ", "T").str.replace("/", "-")
+        return ser
+    elif datetimes in datetimes_values:
+        pass
+    else:
+        raise ValueError(
+            f"Invalid value for 'datetimes': {datetimes!r}. "
+            f"Must be one of {datetimes_values!r}."
+        )

     if PANDAS_GE_22:
-        datetime_kwargs = {"format": "ISO8601"}
+        datetime_kwargs["format"] = "ISO8601"
     elif PANDAS_GE_20:
-        datetime_kwargs = {"format": "ISO8601", "errors": "ignore"}
+        datetime_kwargs["format"] = "ISO8601"
+        datetime_kwargs["errors"] = "ignore"
     else:
-        datetime_kwargs = {"yearfirst": True}
+        datetime_kwargs["yearfirst"] = True
+
     with warnings.catch_warnings():
         warnings.filterwarnings(
             "ignore",
             ".*parsing datetimes with mixed time zones will raise.*",
             FutureWarning,
         )
-        # pre-emptive try catch for when pandas will raise
-        # (can tighten the exception type in future when it does)
+
+        warning = "Error parsing datetimes, original strings are returned: {message}"
         try:
             res = pd.to_datetime(ser, **datetime_kwargs)
-        except Exception:
-            res = ser
-        # if object dtype, try parse as utc instead
-        if res.dtype in ("object", "string"):
+
+            # With pandas >= 2 and < 3, mixed timezones are returned as pandas
+            # Timestamps, so convert them to datetime objects.
+            if (
+                datetimes == "MIXED_TO_DATETIME"
+                and PANDAS_GE_20
+                and res.dtype == "object"
+            ):
+                res = res.map(lambda x: x.to_pydatetime(), na_action="ignore")
+
+        except Exception as ex:
+            if isinstance(ex, ValueError) and "Mixed timezones detected" in str(ex):
+                # Parsing mixed timezones with to_datetime is not supported
+                # anymore in pandas >= 3.0, leading to a ValueError.
+                if datetimes == "MIXED_TO_DATETIME":
+                    # Using map seems to be the fastest way to convert the strings to
+                    # datetimes.
+                    try:
+                        res = ser.map(datetime.fromisoformat, na_action="ignore")
+                    except Exception as ex:
+                        warnings.warn(warning.format(message=str(ex)), stacklevel=1)
+                        return ser
+                elif datetimes == "MIXED_TO_UTC":
+                    # Convert mixed timezone datetimes to UTC.
+                    try:
+                        res = pd.to_datetime(ser, utc=True, **datetime_kwargs)
+                    except Exception as ex:
+                        warnings.warn(warning.format(message=str(ex)), stacklevel=1)
+                        return ser
+                else:
+                    warnings.warn(warning.format(message=str(ex)), stacklevel=1)
+                    return ser
+            else:
+                # If the error is not related to mixed timezones, log it and return
+                # the original series.
+                warnings.warn(warning.format(message=str(ex)), stacklevel=1)
+                return ser
+
+        # For pandas < 3.0, to_datetime converted mixed timezone data to datetime
+        # objects. For "MIXED_TO_UTC" they should still be converted to UTC.
+        if datetimes == "MIXED_TO_UTC" and res.dtype in ("object", "string"):
             try:
                 res = pd.to_datetime(ser, utc=True, **datetime_kwargs)
-            except Exception:
-                pass
+            except Exception as ex:
+                warnings.warn(warning.format(message=str(ex)), stacklevel=1)

     if res.dtype.kind == "M":  # any datetime64
         # GDAL only supports ms precision, convert outputs to match.
@@ -73,6 +140,7 @@ def _try_parse_datetime(ser):
         res = res.dt.as_unit("ms")
     else:
         res = res.dt.round(freq="ms")
+
     return res


@@ -96,6 +164,7 @@ def read_dataframe(
     use_arrow=None,
     on_invalid="raise",
     arrow_to_pandas_kwargs=None,
+    datetimes="MIXED_TO_UTC",
     **kwargs,
 ):
     """Read from an OGR data source to a GeoPandas GeoDataFrame or Pandas DataFrame.
@@ -223,8 +292,31 @@ def read_dataframe(
     arrow_to_pandas_kwargs : dict, optional (default: None)
         When `use_arrow` is True, these kwargs will be passed to the `to_pandas`_
         call for the arrow to pandas conversion.
+    datetimes : str, optional (default: "MIXED_TO_UTC")
+        The way datetime columns should be returned. Possible values:
+
+        - **"MIXED_TO_UTC"**: return all datetime columns as pandas datetime64 columns.
+          The data is returned as-is if a column contains only naive datetimes
+          (without timezone information), only UTC datetimes, or if all datetimes
+          in the column have the same timezone offset.
+          Note that in timezones with daylight saving time, datetimes will have
+          different offsets throughout the year!
+          For columns that don't comply with the above, all datetimes are converted
+          to UTC. In that case naive datetimes are assumed to be in UTC already.
+        - **"MIXED_TO_DATETIME"**: return datetimes in the timezone as they were read
+          from the data source, even if a column contains mixed timezone offsets.
+          Columns will be returned as pandas datetime64 columns if a column contains
+          only naive datetimes (without timezone information), only UTC datetimes,
+          or if all datetimes in the column have the same timezone offset.
+          Note that in timezones with daylight saving time, datetimes will have
+          different offsets throughout the year!
+          Columns that don't comply with the above are returned as object columns
+          with python datetime values. If you want to roundtrip datetimes without
+          data loss, this is the recommended option.
+        - **"STRING"**: return all datetimes as ISO8601 strings.
+
     **kwargs
-        Additional driver-specific dataset open options passed to OGR.  Invalid
+        Additional driver-specific dataset open options passed to OGR. Invalid
         options will trigger a warning.

     Returns
     -------
@@ -267,11 +359,13 @@ def read_dataframe(
     read_func = read_arrow if use_arrow else read
     gdal_force_2d = False if use_arrow else force_2d

-    if not use_arrow:
-        # For arrow, datetimes are read as is.
-        # For numpy IO, datetimes are read as string values to preserve timezone info
-        # as numpy does not directly support timezones.
-        kwargs["datetime_as_string"] = True
+
+    # Always read datetimes as string values to preserve (mixed) timezone info
+    # correctly. If arrow is not used, this is needed because numpy does not
+    # directly support timezones. If arrow is used, it is needed because arrow
+    # datetime columns don't support mixed timezone offsets, and e.g. for .fgb
+    # files timezone info isn't handled correctly even for unique timezone
+    # offsets if datetimes are not read as string.
     result = read_func(
         path_or_buffer,
         layer=layer,
@@ -288,6 +382,7 @@ def read_dataframe(
         sql=sql,
         sql_dialect=sql_dialect,
         return_fids=fid_as_index,
+        datetime_as_string=True,
         **kwargs,
     )
@@ -330,6 +425,11 @@

         del table

+        # convert datetime columns that were read as string to datetime
+        for dtype, column in zip(meta["dtypes"], meta["fields"]):
+            if dtype is not None and dtype.startswith("datetime"):
+                df[column] = _try_parse_datetime(df[column], datetimes=datetimes)
+
         if fid_as_index:
             df = df.set_index(meta["fid_column"])
             df.index.names = ["fid"]
@@ -361,7 +461,7 @@ def read_dataframe(
         df = pd.DataFrame(data, columns=columns, index=index)
         for dtype, c in zip(meta["dtypes"], df.columns):
             if dtype.startswith("datetime"):
-                df[c] = _try_parse_datetime(df[c])
+                df[c] = _try_parse_datetime(df[c], datetimes=datetimes)

     if geometry is None or not read_geometry:
         return df
@@ -584,6 +684,7 @@ def write_dataframe(
             crs = geometry.crs.to_wkt("WKT1_GDAL")

     if use_arrow:
+        import pandas as pd  # only called when pandas is known to be installed
         import pyarrow as pa

         from pyogrio.raw import write_arrow
@@ -619,8 +720,33 @@ def write_dataframe(
             df = pd.DataFrame(df, copy=False)
             df[geometry_column] = geometry

+        # Arrow doesn't support datetime columns with mixed timezones, and GDAL
+        # only supports timezone offsets. Hence, to avoid data loss, convert
+        # columns that can contain datetime values with different offsets to
+        # strings. Also pass a list of these columns on to GDAL so it can still
+        # treat them as datetime columns when writing the dataset.
+        datetime_cols = []
+        for name, dtype in df.dtypes.items():
+            if dtype == "object":
+                # An object column with datetimes can contain multiple offsets.
+                if pd.api.types.infer_dtype(df[name]) == "datetime":
+                    df[name] = df[name].astype("string")
+                    datetime_cols.append(name)
+
+            elif isinstance(dtype, pd.DatetimeTZDtype) and str(dtype.tz) != "UTC":
+                # A pd.datetime64 column with a timezone different than UTC can
+                # contain data with different offsets because of summer/winter time.
+                df[name] = df[name].astype("string")
+                datetime_cols.append(name)
+
         table = pa.Table.from_pandas(df, preserve_index=False)

+        # Add metadata to datetime columns so GDAL knows they are datetimes.
+        for datetime_col in datetime_cols:
+            table = _add_column_metadata(
+                table, column_metadata={datetime_col: {"GDAL:OGR:type": "DateTime"}}
+            )
+
         # Null arrow columns are not supported by GDAL, so convert to string
         for field_index, field in enumerate(table.schema):
             if field.type == pa.null():
@@ -678,6 +804,8 @@ def write_dataframe(
     gdal_tz_offsets = {}
     for name in fields:
         col = df[name]
+        values = None
+
         if isinstance(col.dtype, pd.DatetimeTZDtype):
             # Deal with datetimes with timezones by passing down timezone separately
             # pass down naive datetime
@@ -692,8 +820,24 @@ def write_dataframe(
             # Convert each row offset to a signed multiple of 15m and add to GMT value
             gdal_offset_representation = tz_offset // pd.Timedelta("15m") + 100
             gdal_tz_offsets[name] = gdal_offset_representation.values
-        else:
+
+        elif col.dtype == "object":
+            # Column of Timestamp/datetime objects, split into naive datetime and tz.
+            col_na = df[col.notna()][name]
+            if len(col_na) and all(
+                isinstance(x, (pd.Timestamp, datetime)) for x in col_na
+            ):
+                tz_offset = col.apply(lambda x: None if pd.isna(x) else x.utcoffset())
+                gdal_offset_repr = tz_offset // pd.Timedelta("15m") + 100
+                gdal_tz_offsets[name] = gdal_offset_repr.values
+                naive = col.apply(
+                    lambda x: None if pd.isna(x) else x.replace(tzinfo=None)
+                )
+                values = naive.values
+
+        if values is None:
             values = col.values
+
         if isinstance(values, pd.api.extensions.ExtensionArray):
             from pandas.arrays import BooleanArray, FloatingArray, IntegerArray
@@ -729,3 +873,48 @@ def write_dataframe(
         gdal_tz_offsets=gdal_tz_offsets,
         **kwargs,
     )
+
+
+def _add_column_metadata(table, column_metadata: dict = {}):
+    """Add or update column-level metadata to an arrow table.
+
+    Parameters
+    ----------
+    table : pyarrow.Table
+        The table to add the column metadata to.
+    column_metadata : dict
+        A dictionary with column metadata in the form
+        {
+            "column_1": {"some": "data"},
+            "column_2": {"more": "stuff"},
+        }
+
+    Returns
+    -------
+    pyarrow.Table: table with the updated column metadata.
+    """
+    import pyarrow as pa
+
+    if not column_metadata:
+        return table
+
+    # Create updated column fields with new metadata
+    fields = []
+    for col in table.schema.names:
+        if col in column_metadata:
+            # Add/update column metadata
+            metadata = table.field(col).metadata or {}
+            for key, value in column_metadata[col].items():
+                metadata[key] = value
+            # Update field with updated metadata
+            fields.append(table.field(col).with_metadata(metadata))
+        else:
+            fields.append(table.field(col))
+
+    # Create new schema with the updated field metadata
+    schema = pa.schema(fields, metadata=table.schema.metadata)
+
+    # Build new table with updated schema (shouldn't copy data)
+    table = table.cast(schema)
+
+    return table
diff --git a/pyogrio/raw.py b/pyogrio/raw.py
index 0f0c3063..09bd5aa2 100644
--- a/pyogrio/raw.py
+++ b/pyogrio/raw.py
@@ -233,6 +233,7 @@ def read_arrow(
     sql=None,
     sql_dialect=None,
     return_fids=False,
+    datetime_as_string=False,
     **kwargs,
 ):
     """Read OGR data source into a pyarrow Table.
@@ -303,6 +304,7 @@ def read_arrow(
         skip_features=gdal_skip_features,
         batch_size=batch_size,
         use_pyarrow=True,
+        datetime_as_string=datetime_as_string,
         **kwargs,
     ) as source:
         meta, reader = source
@@ -358,6 +360,7 @@ def open_arrow(
     return_fids=False,
     batch_size=65_536,
     use_pyarrow=False,
+    datetime_as_string=False,
     **kwargs,
 ):
     """Open OGR data source as a stream of Arrow record batches.
@@ -386,6 +389,9 @@ def open_arrow(
         ArrowStream object. In the default case, this stream object needs to be
         passed to another library supporting the Arrow PyCapsule Protocol to
         consume the stream of data.
+    datetime_as_string : bool, optional (default: False)
+        If True, datetime fields detected by GDAL are returned as strings, as
+        arrow datetime columns can't represent e.g. mixed timezone offsets.

     Examples
     --------
@@ -423,6 +429,7 @@
         Meta is: {
             "crs": "<crs>",
             "fields": <ndarray of field names>,
+            "dtypes": <ndarray of numpy dtypes corresponding to fields>,
             "encoding": "<encoding>",
             "geometry_type": "<geometry type>",
             "geometry_name": "<name of geometry column in arrow table>",
@@ -453,6 +460,7 @@ def open_arrow(
         dataset_kwargs=dataset_kwargs,
         batch_size=batch_size,
         use_pyarrow=use_pyarrow,
+        datetime_as_string=datetime_as_string,
     )
diff --git a/pyogrio/tests/conftest.py b/pyogrio/tests/conftest.py
index dbf38e90..560ab710 100644
--- a/pyogrio/tests/conftest.py
+++ b/pyogrio/tests/conftest.py
@@ -344,6 +344,27 @@ def geojson_bytes(tmp_path):
     return bytes_buffer


+@pytest.fixture(scope="function")
+def geojson_datetime_long_ago(tmp_path):
+    # create a GeoJSON file with a datetime from long ago
+    datetime_geojson = """{
+        "type": "FeatureCollection",
+        "features": [
+            {
+                "type": "Feature",
+                "properties": { "datetime_col": "1670-01-01T09:00:00" },
+                "geometry": { "type": "Point", "coordinates": [1, 1] }
+            }
+        ]
+    }"""
+
+    filename = tmp_path / "test_datetime_long_ago.geojson"
+    with open(filename, "w") as f:
+        f.write(datetime_geojson)
+
+    return filename
+
+
 @pytest.fixture(scope="function")
 def geojson_filelike(tmp_path):
     """Extracts first 3 records from naturalearth_lowres and writes to GeoJSON,
diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index 34b7b4e8..45a46ba9 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -48,6 +48,12 @@
     import geopandas as gp
     import pandas as pd
     from geopandas.array import from_wkt
+    from pandas.api.types import (
+        is_datetime64_any_dtype,
+        is_datetime64_dtype,
+        is_object_dtype,
+        is_string_dtype,
+    )

     import shapely  # if geopandas is present, shapely is expected to be present
     from shapely.geometry import Point
@@ -333,77 +339,439 @@ def test_read_datetime(datetime_file, use_arrow):
     assert df.col.dtype.name == "datetime64[ns]"


+def test_read_datetimes_invalid_param(datetime_file, use_arrow):
+    with pytest.raises(ValueError, match="Invalid value for 'datetimes'"):
+        read_dataframe(datetime_file, use_arrow=use_arrow, datetimes="INVALID")
+
+
+@pytest.mark.parametrize("datetimes", ["MIXED_TO_UTC", "MIXED_TO_DATETIME", "STRING"])
+def test_read_datetime_long_ago(geojson_datetime_long_ago, use_arrow, datetimes):
+    """Test reading a column with a datetime far in the past.
+
+    Dates from before 1678-1-1 aren't parsed correctly by pandas < 3.0, so they
+    stay strings.
+    Reported in https://github.com/geopandas/pyogrio/issues/553.
+    """
+    if use_arrow and __gdal_version__ < (3, 11, 0):
+        # With use_arrow and GDAL < 3.11, datetimes are converted to python
+        # objects in to_pandas. For a datetime far in the past this gives an
+        # overflow though.
+        pytest.xfail(
+            "datetimes before 1678-1-1 give overflow if arrow is used with GDAL<3.11"
+        )
+    if not PANDAS_GE_30 and datetimes != "STRING":
+        pytest.xfail(
+            "datetimes before 1678-1-1 are not supported with "
+            f"datetimes={datetimes!r} with pandas < 3.0"
+        )
+
+    df = read_dataframe(
+        geojson_datetime_long_ago, use_arrow=use_arrow, datetimes=datetimes
+    )
+
+    exp_dates = pd.Series(["1670-01-01T09:00:00"], name="datetime_col")
+    if datetimes == "MIXED_TO_UTC":
+        pytest.xfail("datetimes of long ago cannot be parsed as UTC")
+        assert is_datetime64_any_dtype(df.datetime_col.dtype)
+        assert_series_equal(df.datetime_col, exp_dates)
+    elif datetimes == "MIXED_TO_DATETIME":
+        pytest.xfail("datetimes of long ago cannot be parsed as datetime")
+        assert is_datetime64_dtype(df.datetime_col.dtype)
+        if PANDAS_GE_20:
+            exp_dates = pd.to_datetime(exp_dates, format="ISO8601").as_unit("ms")
+        else:
+            exp_dates = pd.to_datetime(exp_dates)
+        assert_series_equal(df.datetime_col, exp_dates)
+    elif datetimes == "STRING":
+        assert is_string_dtype(df.datetime_col.dtype)
+        assert_series_equal(df.datetime_col, exp_dates, check_dtype=False)
+
+
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
+@pytest.mark.parametrize("datetimes", ["MIXED_TO_UTC", "MIXED_TO_DATETIME", "STRING"])
+@pytest.mark.requires_arrow_write_api
+def test_write_read_datetime_no_tz(tmp_path, ext, datetimes, use_arrow):
+    """Test writing/reading a column with naive datetimes (no timezone information)."""
+    dates_raw = ["2020-01-01T09:00:00.123", "2020-01-01T10:00:00", np.nan]
+    if PANDAS_GE_20:
+        dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
+    else:
+        dates = pd.to_datetime(dates_raw)
+    df = gp.GeoDataFrame(
+        {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
+    )
+
+    fpath = tmp_path / f"test{ext}"
+    write_dataframe(df, fpath, use_arrow=use_arrow)
+    result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes)
+
+    if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0):
+        # With GDAL < 3.11 with arrow, columns with naive datetimes are written
+        # correctly, but when read they are wrongly interpreted as being in UTC.
+        # The reason is complicated; more info can be found e.g. here:
+        # https://github.com/geopandas/pyogrio/issues/487#issuecomment-2423762807
+        exp_dates = df.dates.dt.tz_localize("UTC")
+        if datetimes == "MIXED_TO_DATETIME":
+            assert_series_equal(result.dates, exp_dates)
+        elif datetimes == "STRING":
+            exp_dates = exp_dates.astype("string").str.replace(" ", "T")
+            assert_series_equal(result.dates, exp_dates)
+        pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow")
+
+    if datetimes == "MIXED_TO_UTC":
+        assert is_datetime64_any_dtype(result.dates.dtype)
+        assert_series_equal(result.dates, df.dates)
+    elif datetimes == "MIXED_TO_DATETIME":
+        assert is_datetime64_dtype(result.dates.dtype)
+        assert_geodataframe_equal(result, df)
+    elif datetimes == "STRING":
+        assert is_string_dtype(result.dates.dtype)
+        if use_arrow and __gdal_version__ < (3, 11, 0):
+            dates_str = df.dates.astype("string").str.replace(" ", "T")
+        else:
+            dates_str = pd.Series(dates_raw, name="dates")
+        assert_series_equal(result.dates, dates_str, check_dtype=False)
+    else:
+        raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.")
+
+
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
+@pytest.mark.parametrize("datetimes", ["MIXED_TO_UTC", "MIXED_TO_DATETIME", "STRING"])
 @pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ")
 @pytest.mark.requires_arrow_write_api
-def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow):
-    df = read_dataframe(datetime_tz_file)
-    # Make the index non-consecutive to test this case as well. Added for issue
-    # https://github.com/geopandas/pyogrio/issues/324
-    df = df.set_index(np.array([0, 2]))
-    raw_expected = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"]
+def test_write_read_datetime_tz(tmp_path, ext, datetimes, use_arrow):
+    """Write and read a file where all datetimes have the same timezone.
+
+    This should result in a pandas datetime64 dtype column.
+    """
+    if use_arrow and __gdal_version__ < (3, 10, 0) and ext in (".geojson", ".geojsonl"):
+        # With GDAL < 3.10 with arrow, the timezone offset was applied to the
+        # datetime value while the timezone was also retained.
+        # This was fixed in https://github.com/OSGeo/gdal/pull/11049
+        pytest.xfail("Wrong datetimes read in GeoJSON with GDAL < 3.10 via arrow")
+    dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", np.nan]
     if PANDAS_GE_20:
-        expected = pd.to_datetime(raw_expected, format="ISO8601").as_unit("ms")
+        dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
     else:
-        expected = pd.to_datetime(raw_expected)
-    expected = pd.Series(expected, name="datetime_col")
-    assert_series_equal(df.datetime_col, expected, check_index=False)
-    # test write and read round trips
-    fpath = tmp_path / "test.gpkg"
+        dates = pd.to_datetime(dates_raw)
+
+    # Make the index non-consecutive to test this case as well. Added for issue
+    # https://github.com/geopandas/pyogrio/issues/324
+    df = gp.GeoDataFrame(
+        {"dates": dates, "geometry": [Point(1, 1)] * 3},
+        index=[0, 2, 3],
+        crs="EPSG:4326",
+    )
+    assert isinstance(df.dates.dtype, pd.DatetimeTZDtype)
+
+    fpath = tmp_path / f"test{ext}"
     write_dataframe(df, fpath, use_arrow=use_arrow)
-    df_read = read_dataframe(fpath, use_arrow=use_arrow)
-    if use_arrow:
-        # with Arrow, the datetimes are always read as UTC
-        expected = expected.dt.tz_convert("UTC")
-    assert_series_equal(df_read.datetime_col, expected)
+    result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes)
+
+    # With some older versions, the offset is represented slightly differently
+    if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"):
+        result.dates = result.dates.astype(df.dates.dtype)
+
+    if use_arrow and ext in (".fgb", ".gpkg") and __gdal_version__ < (3, 11, 0):
+        # With GDAL < 3.11 with arrow, datetime columns are written as string type
+        df_exp = df.copy()
+        df_exp.dates = df_exp[df_exp.dates.notna()].dates.astype(str)
+        assert_series_equal(result.dates, df_exp.dates, check_index=False)
+        pytest.xfail("datetime columns written as string with GDAL < 3.11 via arrow")
+
+    assert isinstance(df.dates.dtype, pd.DatetimeTZDtype)
+    if datetimes == "MIXED_TO_UTC":
+        assert_series_equal(result.dates, df.dates, check_index=False)
+    elif datetimes == "MIXED_TO_DATETIME":
+        assert_series_equal(result.dates, df.dates, check_index=False)
+    elif datetimes == "STRING":
+        assert is_string_dtype(result.dates.dtype)
+        if use_arrow and __gdal_version__ < (3, 11, 0):
+            dates_str = df.dates.astype("string").str.replace(" ", "T")
+        elif __gdal_version__ < (3, 7, 0):
+            # With GDAL < 3.7, timezone minutes aren't included in the string
+            dates_str = [x[:-3] for x in dates_raw if pd.notna(x)] + [np.nan]
+            dates_str = pd.Series(dates_str, name="dates")
+        else:
+            dates_str = pd.Series(dates_raw, name="dates")
+        assert_series_equal(
+            result.dates, dates_str, check_index=False, check_dtype=False
+        )
+    else:
+        raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.")
+
+
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
+@pytest.mark.parametrize("datetimes", ["MIXED_TO_UTC", "MIXED_TO_DATETIME", "STRING"])
 @pytest.mark.filterwarnings(
     "ignore: Non-conformant content for record 1 in column dates"
 )
 @pytest.mark.requires_arrow_write_api
-def test_write_datetime_mixed_offset(tmp_path, use_arrow):
+def test_write_read_datetime_tz_localized_mixed_offset(
+    tmp_path, ext, datetimes, use_arrow
+):
+    """Test with localized dates across different summer/winter timezone offsets."""
     # Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10)
-    dates = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"]
-    naive_col = pd.Series(pd.to_datetime(dates), name="dates")
-    localised_col = naive_col.dt.tz_localize("Australia/Sydney")
-    utc_col = localised_col.dt.tz_convert("UTC")
-    if PANDAS_GE_20:
-        utc_col = utc_col.dt.as_unit("ms")
+    dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111", np.nan]
+    dates_naive = pd.Series(pd.to_datetime(dates_raw), name="dates")
+    dates_local = dates_naive.dt.tz_localize("Australia/Sydney")
+    dates_local_offsets_str = dates_local.astype(str)
+    if datetimes == "MIXED_TO_UTC":
+        exp_dates = dates_local.dt.tz_convert("UTC")
+        if PANDAS_GE_20:
+            exp_dates = exp_dates.dt.as_unit("ms")
+    elif datetimes == "MIXED_TO_DATETIME":
+        exp_dates = dates_local_offsets_str.apply(
+            lambda x: pd.Timestamp(x) if pd.notna(x) else None
+        )
+    elif datetimes == "STRING":
+        exp_dates = dates_local_offsets_str.str.replace(" ", "T")
+        exp_dates = exp_dates.str.replace(".111000", ".111")
+        if __gdal_version__ < (3, 7, 0):
+            # With GDAL < 3.7, timezone minutes aren't included in the string
+            exp_dates = exp_dates.str.slice(0, -3)
+    else:
+        raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.")

     df = gp.GeoDataFrame(
-        {"dates": localised_col, "geometry": [Point(1, 1), Point(1, 1)]},
-        crs="EPSG:4326",
+        {"dates": dates_local, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
     )
-    fpath = tmp_path / "test.gpkg"
+    fpath = tmp_path / f"test{ext}"
     write_dataframe(df, fpath, use_arrow=use_arrow)
-    result = read_dataframe(fpath, use_arrow=use_arrow)
+    result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes)
+
+    if use_arrow and __gdal_version__ < (3, 11, 0):
+        if ext in (".geojson", ".geojsonl"):
+            # With GDAL < 3.11 with arrow, GDAL converts mixed timezone datetimes
+            # to UTC when read, as the arrow datetime column type does not support
+            # mixed tz.
+            dates_utc = dates_local.dt.tz_convert("UTC")
+            if PANDAS_GE_20:
+                dates_utc = dates_utc.dt.as_unit("ms")
+            if datetimes == "STRING":
+                assert is_string_dtype(result.dates.dtype)
+                dates_utc = dates_utc.astype(str).str.replace(" ", "T")
+            assert pd.isna(result.dates[2])
+            assert_series_equal(
+                result.dates.head(2), dates_utc.head(2), check_dtype=False
+            )
+            pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow")
+        elif ext in (".gpkg", ".fgb"):
+            # With GDAL < 3.11 with arrow, datetime columns are written as string type
+            assert pd.isna(result.dates[2])
+            assert_series_equal(result.dates.head(2), dates_local_offsets_str.head(2))
+            pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow")
+
     # GDAL tz only encodes offsets, not timezones
-    # check multiple offsets are read as utc datetime instead of string values
-    assert_series_equal(result["dates"], utc_col)
+    if datetimes == "MIXED_TO_UTC":
+        assert isinstance(result.dates.dtype, pd.DatetimeTZDtype)
+    elif datetimes == "MIXED_TO_DATETIME":
+        assert is_object_dtype(result.dates.dtype)
+    elif datetimes == "STRING":
+        assert is_string_dtype(result.dates.dtype)
+    else:
+        raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.")
+
+    # Check isna for the third value separately as depending on versions this is
+    # different, and pandas 3.0 assert_series_equal becomes strict about this.
+    assert pd.isna(result.dates[2])
+    assert_series_equal(result.dates.head(2), exp_dates.head(2), check_dtype=False)
+
+
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
+@pytest.mark.parametrize("datetimes", ["MIXED_TO_UTC", "MIXED_TO_DATETIME", "STRING"])
+@pytest.mark.filterwarnings(
+    "ignore: Non-conformant content for record 1 in column dates"
+)
+@pytest.mark.requires_arrow_write_api
+def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, datetimes, use_arrow):
+    """Test with dates with mixed timezone offsets."""
+    # The pandas datetime64 column type doesn't support mixed timezone offsets,
+    # so it needs to be a list of pandas.Timestamp objects instead.
+    dates = [
+        pd.Timestamp("2023-01-01 11:00:01.111+01:00"),
+        pd.Timestamp("2023-06-01 10:00:01.111+05:00"),
+        np.nan,
+    ]
+    df = gp.GeoDataFrame(
+        {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
+    )
+    fpath = tmp_path / f"test{ext}"
+    write_dataframe(df, fpath, use_arrow=use_arrow)
+    result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes)
+
+    if use_arrow and __gdal_version__ < (3, 11, 0):
+        if ext in (".geojson", ".geojsonl"):
+            # With GDAL < 3.11 with arrow, GDAL converts mixed timezone datetimes
+            # to UTC when read, as the arrow datetime column type does not support
+            # mixed tz.
+            df_exp = df.copy()
+            df_exp.dates = pd.to_datetime(dates, utc=True)
+            if PANDAS_GE_20:
+                df_exp.dates = df_exp.dates.dt.as_unit("ms")
+            if datetimes == "STRING":
+                df_exp.dates = df_exp.dates.astype("string").str.replace(" ", "T")
+                df_exp.loc[2, "dates"] = pd.NA
+            assert_geodataframe_equal(result, df_exp)
+            pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow")
+        elif ext in (".gpkg", ".fgb"):
+            # With arrow and GDAL < 3.11, mixed timezone datetimes are written as
+            # string type columns, so no proper roundtrip is possible.
+            df_exp = df.copy()
+            df_exp.dates = df_exp.dates.astype("string").astype("O")
+            assert_geodataframe_equal(result, df_exp)
+            pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow")
+
+    if datetimes == "MIXED_TO_UTC":
+        assert isinstance(result.dates.dtype, pd.DatetimeTZDtype)
+        exp_dates = pd.to_datetime(df.dates, utc=True)
+        if PANDAS_GE_20:
+            exp_dates = exp_dates.dt.as_unit("ms")
+        assert_series_equal(result.dates, exp_dates)
+    elif datetimes == "MIXED_TO_DATETIME":
+        assert is_object_dtype(result.dates.dtype)
+        assert_geodataframe_equal(result, df)
+    elif datetimes == "STRING":
+        assert is_string_dtype(result.dates.dtype)
+        dates_str = df.dates.map(
+            lambda x: x.isoformat(timespec="milliseconds") if pd.notna(x) else np.nan
+        )
+        if __gdal_version__ < (3, 7, 0):
+            # With GDAL < 3.7, timezone minutes aren't included in the string
+            dates_str = dates_str.str.slice(0, -3)
+        assert_series_equal(result.dates, dates_str)
+    else:
+        raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.")
+
+
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
+@pytest.mark.parametrize(
+    "dates_raw",
+    [
+        (
+            pd.Timestamp("2020-01-01T09:00:00.123-05:00"),
+            pd.Timestamp("2020-01-01T10:00:00-05:00"),
+            np.nan,
+        ),
+        (
+            datetime.fromisoformat("2020-01-01T09:00:00.123-05:00"),
+            datetime.fromisoformat("2020-01-01T10:00:00-05:00"),
+            np.nan,
+        ),
+    ],
+)
+@pytest.mark.parametrize("datetimes", ["MIXED_TO_UTC", "MIXED_TO_DATETIME", "STRING"])
 @pytest.mark.filterwarnings(
     "ignore: Non-conformant content for record 1 in column dates"
 )
 @pytest.mark.requires_arrow_write_api
-def test_read_write_datetime_tz_with_nulls(tmp_path, use_arrow):
-    dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", pd.NaT]
+def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow, datetimes):
+    """Datetime objects with equal offsets are read as datetime64."""
+    if use_arrow and __gdal_version__ < (3, 10, 0) and ext in (".geojson", ".geojsonl"):
+        # With GDAL < 3.10 with arrow, the timezone offset was applied to the
+        # datetime value while the timezone was also retained.
+        # This was fixed in https://github.com/OSGeo/gdal/pull/11049
+        pytest.xfail("Wrong datetimes read in GeoJSON with GDAL < 3.10 via arrow")
+
+    dates = pd.Series(dates_raw, dtype="O")
+    df = gp.GeoDataFrame(
+        {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
+    )
+
+    fpath = tmp_path / f"test{ext}"
+    write_dataframe(df, fpath, use_arrow=use_arrow)
+    result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes)
+
+    # Check result
+    if PANDAS_GE_20:
+        exp_dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
+    else:
+        exp_dates = pd.to_datetime(dates_raw)
+    exp_df = df.copy()
+    exp_df.dates = pd.Series(exp_dates, name="dates")
+
+    # With some older versions, the offset is represented slightly differently
+    if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"):
+        result.dates = result.dates.astype(exp_df.dates.dtype)
+
+    if use_arrow and __gdal_version__ < (3, 11, 0):
+        if ext in (".fgb", ".gpkg"):
+            # With GDAL < 3.11 with arrow, datetime columns are written as string type
+            exp2_df = exp_df.copy()
+            exp2_df.dates = exp2_df.dates.astype("string").astype("O")
+            assert_geodataframe_equal(result, exp2_df)
+            pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow")
+
+    if datetimes == "MIXED_TO_UTC":
+        assert isinstance(result.dates.dtype, pd.DatetimeTZDtype)
+        exp_df.dates = exp_df.dates.dt.tz_convert("UTC")
+    elif datetimes == "MIXED_TO_DATETIME":
+        assert isinstance(result.dates.dtype, pd.DatetimeTZDtype)
+    elif datetimes == "STRING":
+        assert is_string_dtype(result.dates.dtype)
+        if use_arrow and __gdal_version__ < (3, 11, 0):
+            # With GDAL < 3.11 with arrow, datetime columns are written as string type
+            exp_df.dates = exp_df.dates.astype("string").str.replace(" ", "T")
+        else:
+            exp_df.dates = df.dates.map(
+                lambda x: x.isoformat(timespec="milliseconds").replace(".000", "")
+                if pd.notna(x)
+                else np.nan
+            )
+            if __gdal_version__ < (3, 7, 0):
+                # With GDAL < 3.7, timezone minutes aren't included in the string
+                exp_df.dates = exp_df.dates.str.slice(0, -3)
+    else:
+        raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.")
+    assert_geodataframe_equal(result, exp_df, check_dtype=False)
+
+
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
+@pytest.mark.parametrize("datetimes", ["MIXED_TO_UTC", "MIXED_TO_DATETIME", "STRING"])
+@pytest.mark.requires_arrow_write_api
+def test_write_read_datetime_utc(tmp_path, ext, use_arrow, datetimes):
+    """Test writing/reading a column with UTC datetimes."""
+    dates_raw = ["2020-01-01T09:00:00.123Z", "2020-01-01T10:00:00Z", np.nan]
     if PANDAS_GE_20:
         dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
     else:
         dates = pd.to_datetime(dates_raw)
     df = gp.GeoDataFrame(
-        {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]},
-        crs="EPSG:4326",
+        {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
     )
-    fpath = tmp_path / "test.gpkg"
+    assert df.dates.dtype.name in ("datetime64[ms, UTC]", "datetime64[ns, UTC]")
+
+    fpath = tmp_path / f"test{ext}"
     write_dataframe(df, fpath, use_arrow=use_arrow)
-    result = read_dataframe(fpath, use_arrow=use_arrow)
-    if use_arrow:
-        # with Arrow, the datetimes are always read as UTC
-        df["dates"] = df["dates"].dt.tz_convert("UTC")
-    assert_geodataframe_equal(df, result)
+    result = read_dataframe(fpath, use_arrow=use_arrow, datetimes=datetimes)
+
+    if use_arrow and ext == ".fgb" and __gdal_version__ < (3, 11, 0):
+        # With GDAL < 3.11 with arrow, timezone information is dropped when
+        # reading .fgb
+        if datetimes in ("MIXED_TO_UTC", "MIXED_TO_DATETIME"):
+            assert_series_equal(result.dates, df.dates.dt.tz_localize(None))
+        elif datetimes == "STRING":
+            assert is_string_dtype(result.dates.dtype)
+            exp_dates = (
+                df.dates.dt.tz_localize(None).astype("string").str.replace(" ", "T")
+            )
+            assert_series_equal(result.dates, exp_dates, check_dtype=False)
+        pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow")
+
+    if datetimes in ("MIXED_TO_UTC", "MIXED_TO_DATETIME"):
+        assert result.dates.dtype.name in ("datetime64[ms, UTC]", "datetime64[ns, UTC]")
+        assert_geodataframe_equal(result, df)
+    elif datetimes == "STRING":
+        assert is_string_dtype(result.dates.dtype)
+        if use_arrow and __gdal_version__ < (3, 11, 0):
+            dates_str = df.dates.astype("string").str.replace(" ", "T")
+        else:
+            dates_str = pd.Series(dates_raw, name="dates")
+            if __gdal_version__ < (3, 7, 0):
+                # With GDAL < 3.7, datetime ends with +00 for UTC, not Z
+                dates_str = dates_str.str.replace("Z", "+00")
+        assert_series_equal(result.dates, dates_str, check_dtype=False)
+    else:
+        raise ValueError(f"Invalid value for 'datetimes': {datetimes!r}.")


 def test_read_null_values(tmp_path, use_arrow):
diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py
index e9a6176a..dbb49170 100644
--- a/pyogrio/tests/test_raw_io.py
+++ b/pyogrio/tests/test_raw_io.py
@@ -1053,9 +1053,14 @@ def test_read_datetime_as_string(datetime_tz_file):
     field = read(datetime_tz_file, datetime_as_string=True)[3][0]
     assert field.dtype == "object"
-    # GDAL doesn't return strings in ISO format (yet)
-    assert field[0] == "2020/01/01 09:00:00.123-05"
-    assert field[1] == "2020/01/01 10:00:00-05"
+
+    if __gdal_version__ < (3, 7, 0):
+        # With GDAL < 3.7, datetimes are not returned as ISO8601 strings
+        assert field[0] == "2020/01/01 09:00:00.123-05"
+        assert field[1] == "2020/01/01 10:00:00-05"
+    else:
+        assert field[0] == "2020-01-01T09:00:00.123-05:00"
+        assert field[1] == "2020-01-01T10:00:00-05:00"


 @pytest.mark.parametrize("ext", ["gpkg", "geojson"])
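
Usage sketch (for review, not part of the patch): how the new `datetimes` parameter of `read_dataframe` is meant to be used. The path "example.gpkg" is hypothetical and stands in for any OGR data source with datetime columns; the behaviour of each value follows the `read_dataframe` docstring in this diff.

    from pyogrio import read_dataframe

    # Default: datetime columns are parsed to pandas datetime64 columns; columns
    # with mixed timezone offsets are converted to UTC.
    df_utc = read_dataframe("example.gpkg", datetimes="MIXED_TO_UTC")

    # Lossless roundtrip: columns with mixed offsets are returned as object
    # columns holding python datetime values with their original offsets.
    df_dt = read_dataframe("example.gpkg", datetimes="MIXED_TO_DATETIME")

    # No parsing: all datetime columns are returned as ISO8601 strings.
    df_str = read_dataframe("example.gpkg", datetimes="STRING")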