diff --git a/CHANGES.md b/CHANGES.md index 28c6b975..4806454b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,8 +2,14 @@ ## 0.12.0 (yyyy-mm-dd) +### Potentially breaking changes + +- Return JSON fields (as identified by GDAL) as dicts/lists in `read_dataframe`; + these were previously returned as strings (#556). + ### Improvements +- Add listing of GDAL data types and subtypes to `read_info` (#556). - Add support to read list fields without arrow (#558). ### Bug fixes diff --git a/pyogrio/_compat.py b/pyogrio/_compat.py index 0e4650de..78638f61 100644 --- a/pyogrio/_compat.py +++ b/pyogrio/_compat.py @@ -44,6 +44,7 @@ PANDAS_GE_22 = pandas is not None and Version(pandas.__version__) >= Version("2.2.0") PANDAS_GE_30 = pandas is not None and Version(pandas.__version__) >= Version("3.0.0dev") +GDAL_GE_350 = __gdal_version__ >= (3, 5, 0) GDAL_GE_352 = __gdal_version__ >= (3, 5, 2) GDAL_GE_37 = __gdal_version__ >= (3, 7, 0) GDAL_GE_38 = __gdal_version__ >= (3, 8, 0) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index c3cd2373..941a90c7 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -65,6 +65,25 @@ FIELD_TYPES = [ "list(int64)" # OFTInteger64List, List of 64bit integers, not supported ] +# Mapping of OGR integer field types to OGR type names +# (the dict key is the integer field type) +FIELD_TYPE_NAMES = { + OFTInteger: "OFTInteger", # Simple 32bit integer + OFTIntegerList: "OFTIntegerList", # List of 32bit integers, not supported + OFTReal: "OFTReal", # Double Precision floating point + OFTRealList: "OFTRealList", # List of doubles, not supported + OFTString: "OFTString", # String of UTF-8 chars + OFTStringList: "OFTStringList", # Array of strings, not supported + OFTWideString: "OFTWideString", # deprecated, not supported + OFTWideStringList: "OFTWideStringList", # deprecated, not supported + OFTBinary: "OFTBinary", # Raw Binary data + OFTDate: "OFTDate", # Date + OFTTime: "OFTTime", # Time: not directly supported in numpy + OFTDateTime: "OFTDateTime", # Date 
and Time + OFTInteger64: "OFTInteger64", # Single 64bit integer + OFTInteger64List: "OFTInteger64List", # List of 64bit integers, not supported +} + FIELD_SUBTYPES = { OFSTNone: None, # No subtype OFSTBoolean: "bool", # Boolean integer @@ -72,6 +91,16 @@ FIELD_SUBTYPES = { OFSTFloat32: "float32", # Single precision (32 bit) floating point } +FIELD_SUBTYPE_NAMES = { + OFSTNone: "OFSTNone", # No subtype + OFSTBoolean: "OFSTBoolean", # Boolean integer + OFSTInt16: "OFSTInt16", # Signed 16-bit integer + OFSTFloat32: "OFSTFloat32", # Single precision (32 bit) floating point + OFSTJSON: "OFSTJSON", + OFSTUUID: "OFSTUUID", + OFSTMaxSubType: "OFSTMaxSubType", +} + # Mapping of numpy ndarray dtypes to (field type, subtype) DTYPE_OGR_FIELD_TYPES = { "int8": (OFTInteger, OFSTInt16), @@ -633,8 +662,8 @@ cdef get_fields(OGRLayerH ogr_layer, str encoding, use_arrow=False): Returns ------- - ndarray(n, 4) - array of index, ogr type, name, numpy type + ndarray(n, 5) + array of index, ogr type, name, numpy type, ogr subtype """ cdef int i cdef int field_count @@ -654,7 +683,7 @@ cdef get_fields(OGRLayerH ogr_layer, str encoding, use_arrow=False): field_count = OGR_FD_GetFieldCount(ogr_featuredef) - fields = np.empty(shape=(field_count, 4), dtype=object) + fields = np.empty(shape=(field_count, 5), dtype=object) fields_view = fields[:, :] skipped_fields = False @@ -691,6 +720,7 @@ cdef get_fields(OGRLayerH ogr_layer, str encoding, use_arrow=False): fields_view[i, 1] = field_type fields_view[i, 2] = field_name fields_view[i, 3] = np_type + fields_view[i, 4] = field_subtype if skipped_fields: # filter out skipped fields @@ -1472,11 +1502,18 @@ def ogr_read( datetime_as_string=datetime_as_string ) + ogr_types = [FIELD_TYPE_NAMES.get(field[1], "Unknown") for field in fields] + ogr_subtypes = [ + FIELD_SUBTYPE_NAMES.get(field[4], "Unknown") for field in fields + ] + meta = { "crs": crs, "encoding": encoding, "fields": fields[:, 2], "dtypes": fields[:, 3], + "ogr_types": ogr_types, + 
"ogr_subtypes": ogr_subtypes, "geometry_type": geometry_type, } @@ -1804,10 +1841,18 @@ def ogr_open_arrow( else: reader = _ArrowStream(capsule) + ogr_types = [FIELD_TYPE_NAMES.get(field[1], "Unknown") for field in fields] + ogr_subtypes = [ + FIELD_SUBTYPE_NAMES.get(field[4], "Unknown") for field in fields + ] + meta = { "crs": crs, "encoding": encoding, "fields": fields[:, 2], + "dtypes": fields[:, 3], + "ogr_types": ogr_types, + "ogr_subtypes": ogr_subtypes, "geometry_type": geometry_type, "geometry_name": geometry_name, "fid_column": fid_column, @@ -1964,6 +2009,10 @@ def ogr_read_info( encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) fields = get_fields(ogr_layer, encoding) + ogr_types = [FIELD_TYPE_NAMES.get(field[1], "Unknown") for field in fields] + ogr_subtypes = [ + FIELD_SUBTYPE_NAMES.get(field[4], "Unknown") for field in fields + ] meta = { "layer_name": get_string(OGR_L_GetName(ogr_layer)), @@ -1971,6 +2020,8 @@ def ogr_read_info( "encoding": encoding, "fields": fields[:, 2], "dtypes": fields[:, 3], + "ogr_types": ogr_types, + "ogr_subtypes": ogr_subtypes, "fid_column": get_string(OGR_L_GetFIDColumn(ogr_layer)), "geometry_name": get_string(OGR_L_GetGeometryColumn(ogr_layer)), "geometry_type": get_geometry_type(ogr_layer), diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd index 78b66b96..bed01e2d 100644 --- a/pyogrio/_ogr.pxd +++ b/pyogrio/_ogr.pxd @@ -185,6 +185,9 @@ cdef extern from "ogr_core.h": OFSTBoolean OFSTInt16 OFSTFloat32 + OFSTJSON + OFSTUUID + OFSTMaxSubType ctypedef void* OGRDataSourceH ctypedef void* OGRFeatureDefnH diff --git a/pyogrio/core.py b/pyogrio/core.py index 1fa18fa4..4ae00158 100644 --- a/pyogrio/core.py +++ b/pyogrio/core.py @@ -261,6 +261,8 @@ def read_info( "crs": "", "fields": , "dtypes": , + "ogr_types": , + "ogr_subtypes": , "encoding": "", "fid_column": "", "geometry_name": "", diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index ce57575b..55c84527 100644 --- a/pyogrio/geopandas.py +++ 
b/pyogrio/geopandas.py @@ -1,5 +1,6 @@ """Functions for reading and writing GeoPandas dataframes.""" +import json import os import warnings @@ -330,6 +331,10 @@ def read_dataframe( del table + for ogr_subtype, c in zip(meta["ogr_subtypes"], df.columns): + if ogr_subtype == "OFSTJSON": + df[c] = df[c].map(json.loads, na_action="ignore") + if fid_as_index: df = df.set_index(meta["fid_column"]) df.index.names = ["fid"] @@ -362,6 +367,9 @@ def read_dataframe( for dtype, c in zip(meta["dtypes"], df.columns): if dtype.startswith("datetime"): df[c] = _try_parse_datetime(df[c]) + for ogr_subtype, c in zip(meta["ogr_subtypes"], df.columns): + if ogr_subtype == "OFSTJSON": + df[c] = df[c].map(json.loads, na_action="ignore") if geometry is None or not read_geometry: return df diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 0f0c3063..098a9da5 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -171,9 +171,11 @@ def read( Meta is: { "crs": "", "fields": , - "dtypes": + "dtypes": , + "ogr_types": , + "ogr_subtypes": , "encoding": "", - "geometry_type": "" + "geometry_type": "", } .. _OGRSQL: @@ -249,9 +251,13 @@ def read_arrow( Meta is: { "crs": "", "fields": , + "dtypes": , + "ogr_types": , + "ogr_subtypes": , "encoding": "", "geometry_type": "", "geometry_name": "", + "fid_column": "" } """ @@ -691,9 +697,11 @@ def write( options will trigger a warning. """ - # if dtypes is given, remove it from kwargs (dtypes is included in meta returned by + # remove some unneeded kwargs (e.g. 
dtypes is included in meta returned by # read, and it is convenient to pass meta directly into write for round trip tests) kwargs.pop("dtypes", None) + kwargs.pop("ogr_types", None) + kwargs.pop("ogr_subtypes", None) path, driver = _get_write_path_driver(path, driver, append=append) diff --git a/pyogrio/tests/conftest.py b/pyogrio/tests/conftest.py index ee0d6739..08109f81 100644 --- a/pyogrio/tests/conftest.py +++ b/pyogrio/tests/conftest.py @@ -214,8 +214,8 @@ def list_field_values_file(tmp_path): { "type": "Feature", "properties": { - "int64": 1, - "list_int64": [0, 1], + "int": 1, + "list_int": [0, 1], "list_double": [0.0, 1.0], "list_string": ["string1", "string2"], "list_int_with_null": [0, null], @@ -226,8 +226,8 @@ def list_field_values_file(tmp_path): { "type": "Feature", "properties": { - "int64": 2, - "list_int64": [2, 3], + "int": 2, + "list_int": [2, 3], "list_double": [2.0, 3.0], "list_string": ["string3", "string4", ""], "list_int_with_null": [2, 3], @@ -238,8 +238,8 @@ def list_field_values_file(tmp_path): { "type": "Feature", "properties": { - "int64": 3, - "list_int64": [], + "int": 3, + "list_int": [], "list_double": [], "list_string": [], "list_int_with_null": [], @@ -250,8 +250,8 @@ def list_field_values_file(tmp_path): { "type": "Feature", "properties": { - "int64": 4, - "list_int64": null, + "int": 4, + "list_int": null, "list_double": null, "list_string": null, "list_int_with_null": null, @@ -262,8 +262,8 @@ def list_field_values_file(tmp_path): { "type": "Feature", "properties": { - "int64": 5, - "list_int64": null, + "int": 5, + "list_int": null, "list_double": null, "list_string": [""], "list_int_with_null": null, diff --git a/pyogrio/tests/test_core.py b/pyogrio/tests/test_core.py index 4949b419..11af010e 100644 --- a/pyogrio/tests/test_core.py +++ b/pyogrio/tests/test_core.py @@ -18,7 +18,7 @@ vsi_rmtree, vsi_unlink, ) -from pyogrio._compat import GDAL_GE_38 +from pyogrio._compat import GDAL_GE_38, GDAL_GE_350 from pyogrio._env import 
GDALEnv from pyogrio.errors import DataLayerError, DataSourceError from pyogrio.raw import read, write @@ -583,6 +583,17 @@ def test_read_info_force_total_bounds( assert info["total_bounds"] is None +def test_read_info_jsonfield(nested_geojson_file): + """Test if JSON field types are returned correctly.""" + meta = read_info(nested_geojson_file) + assert meta["ogr_types"] == ["OFTString", "OFTString"] + if GDAL_GE_350: + # OFSTJSON is only supported for GDAL >= 3.5 + assert meta["ogr_subtypes"] == ["OFSTNone", "OFSTJSON"] + else: + assert meta["ogr_subtypes"] == ["OFSTNone", "OFSTNone"] + + def test_read_info_unspecified_layer_warning(data_dir): """Reading a multi-layer file without specifying a layer gives a warning.""" with pytest.warns(UserWarning, match="More than one layer found "): diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index fb61308a..32bf0711 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -20,6 +20,7 @@ from pyogrio._compat import ( GDAL_GE_37, GDAL_GE_311, + GDAL_GE_350, GDAL_GE_352, HAS_ARROW_WRITE_API, HAS_PYPROJ, @@ -370,25 +371,36 @@ def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow): assert_series_equal(df_read.datetime_col, expected) +@pytest.mark.skipif( + not GDAL_GE_350, + reason="OFSTJSON subtype + some list type situations need GDAL >= 3.5", +) def test_read_list_types(list_field_values_file, use_arrow): - if not GDAL_GE_352: - pytest.xfail(reason="GDAL 3.4.3 didn't handle all list types perfectly") - + """Test reading a geojson file containing fields with lists.""" + info = read_info(list_field_values_file) result = read_dataframe(list_field_values_file, use_arrow=use_arrow) - assert "list_int64" in result.columns - assert result["list_int64"][0].tolist() == [0, 1] - assert result["list_int64"][1].tolist() == [2, 3] - assert result["list_int64"][2].tolist() == [] - assert result["list_int64"][3] is None - assert result["list_int64"][4] is 
None + assert "list_int" in result.columns + assert info["fields"][1] == "list_int" + assert info["ogr_types"][1] == "OFTIntegerList" + assert result["list_int"][0].tolist() == [0, 1] + assert result["list_int"][1].tolist() == [2, 3] + assert result["list_int"][2].tolist() == [] + assert result["list_int"][3] is None + assert result["list_int"][4] is None + assert "list_double" in result.columns + assert info["fields"][2] == "list_double" + assert info["ogr_types"][2] == "OFTRealList" assert result["list_double"][0].tolist() == [0.0, 1.0] assert result["list_double"][1].tolist() == [2.0, 3.0] assert result["list_double"][2].tolist() == [] assert result["list_double"][3] is None assert result["list_double"][4] is None + assert "list_string" in result.columns + assert info["fields"][3] == "list_string" + assert info["ogr_types"][3] == "OFTStringList" assert result["list_string"][0].tolist() == ["string1", "string2"] assert result["list_string"][1].tolist() == ["string3", "string4", ""] assert result["list_string"][2].tolist() == [] @@ -396,20 +408,28 @@ def test_read_list_types(list_field_values_file, use_arrow): assert result["list_string"][4] == [""] # Once any row of a column contains a null value in a list (in the test geojson), - # the column isn't recognized as a list column anymore and the values are returned - # as strings. + # the column isn't recognized as a list column anymore, but as a JSON column. + # Because JSON columns containing JSON Arrays are also parsed to python lists, the + # end result is the same... 
assert "list_int_with_null" in result.columns - assert result["list_int_with_null"][0] == "[ 0, null ]" - assert result["list_int_with_null"][1] == "[ 2, 3 ]" - assert result["list_int_with_null"][2] == "[ ]" + assert info["fields"][4] == "list_int_with_null" + assert info["ogr_types"][4] == "OFTString" + assert info["ogr_subtypes"][4] == "OFSTJSON" + assert result["list_int_with_null"][0] == [0, None] + assert result["list_int_with_null"][1] == [2, 3] + assert result["list_int_with_null"][2] == [] assert pd.isna(result["list_int_with_null"][3]) assert pd.isna(result["list_int_with_null"][4]) + assert "list_string_with_null" in result.columns - assert result["list_string_with_null"][0] == '[ "string1", null ]' - assert result["list_string_with_null"][1] == '[ "string3", "string4", "" ]' - assert result["list_string_with_null"][2] == "[ ]" + assert info["fields"][5] == "list_string_with_null" + assert info["ogr_types"][5] == "OFTString" + assert info["ogr_subtypes"][5] == "OFSTJSON" + assert result["list_string_with_null"][0] == ["string1", None] + assert result["list_string_with_null"][1] == ["string3", "string4", ""] + assert result["list_string_with_null"][2] == [] assert pd.isna(result["list_string_with_null"][3]) - assert result["list_string_with_null"][4] == '[ "" ]' + assert result["list_string_with_null"][4] == [""] @pytest.mark.filterwarnings( @@ -2052,6 +2072,9 @@ def test_read_multisurface(multisurface_file, use_arrow): assert df.geometry.type.tolist() == ["MultiPolygon"] +@pytest.mark.skipif( + not GDAL_GE_350, reason="OFSTJSON subtype only supported for GDAL >= 3.5" +) def test_read_dataset_kwargs(nested_geojson_file, use_arrow): # by default, nested data are not flattened df = read_dataframe(nested_geojson_file, use_arrow=use_arrow) @@ -2059,7 +2082,7 @@ def test_read_dataset_kwargs(nested_geojson_file, use_arrow): expected = gp.GeoDataFrame( { "top_level": ["A"], - "intermediate_level": ['{ "bottom_level": "B" }'], + "intermediate_level": 
[{"bottom_level": "B"}], }, geometry=[shapely.Point(0, 0)], crs="EPSG:4326",