Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,14 @@

## 0.12.0 (yyyy-mm-dd)

### Potentially breaking changes

- Return JSON fields (as identified by GDAL) as dicts/lists in `read_dataframe`;
these were previously returned as strings (#556).

### Improvements

- Add listing of GDAL data types and subtypes to `read_info` (#556).
- Add support to read list fields without arrow (#558).

### Bug fixes
Expand Down
1 change: 1 addition & 0 deletions pyogrio/_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
PANDAS_GE_22 = pandas is not None and Version(pandas.__version__) >= Version("2.2.0")
PANDAS_GE_30 = pandas is not None and Version(pandas.__version__) >= Version("3.0.0dev")

GDAL_GE_350 = __gdal_version__ >= (3, 5, 0)
GDAL_GE_352 = __gdal_version__ >= (3, 5, 2)
GDAL_GE_37 = __gdal_version__ >= (3, 7, 0)
GDAL_GE_38 = __gdal_version__ >= (3, 8, 0)
Expand Down
57 changes: 54 additions & 3 deletions pyogrio/_io.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,42 @@ FIELD_TYPES = [
"list(int64)" # OFTInteger64List, List of 64bit integers, not supported
]

# Mapping of OGR integer field types to OGR type names
# (a dict keyed by the OFT* enum value; types that are not listed here are
# reported as "Unknown" by callers that use FIELD_TYPE_NAMES.get(..., "Unknown"))
FIELD_TYPE_NAMES = {
    OFTInteger: "OFTInteger",                # Simple 32bit integer
    OFTIntegerList: "OFTIntegerList",        # List of 32bit integers, not supported
    OFTReal: "OFTReal",                      # Double Precision floating point
    OFTRealList: "OFTRealList",              # List of doubles, not supported
    OFTString: "OFTString",                  # String of UTF-8 chars
    OFTStringList: "OFTStringList",          # Array of strings, not supported
    OFTWideString: "OFTWideString",          # deprecated, not supported
    OFTWideStringList: "OFTWideStringList",  # deprecated, not supported
    OFTBinary: "OFTBinary",                  # Raw Binary data
    OFTDate: "OFTDate",                      # Date
    OFTTime: "OFTTime",                      # Time: not directly supported in numpy
    OFTDateTime: "OFTDateTime",              # Date and Time
    OFTInteger64: "OFTInteger64",            # Single 64bit integer
    OFTInteger64List: "OFTInteger64List",    # List of 64bit integers, not supported
}

# Mapping of OGR field subtypes to a type-name string (None for "no subtype").
# NOTE(review): the values look like numpy dtype names used to refine the dtype
# derived from the field type -- confirm against get_fields. OFSTJSON and
# OFSTUUID have no entry in this mapping.
FIELD_SUBTYPES = {
    OFSTNone: None,           # No subtype
    OFSTBoolean: "bool",      # Boolean integer
    OFSTInt16: "int16",       # Signed 16-bit integer
    OFSTFloat32: "float32",   # Single precision (32 bit) floating point
}

# Mapping of OGR integer field subtypes to OGR subtype names
# (a dict keyed by the OFST* enum value).
#
# OFSTMaxSubType is deliberately NOT a key: in GDAL's ogr_core.h it is a
# sentinel that shares its integer value with the highest real subtype
# (OFSTUUID in GDAL >= 3.3). Listing it after OFSTUUID made the two keys
# collide, so the later entry won and UUID fields were reported as
# "OFSTMaxSubType" instead of "OFSTUUID".
FIELD_SUBTYPE_NAMES = {
    OFSTNone: "OFSTNone",         # No subtype
    OFSTBoolean: "OFSTBoolean",   # Boolean integer
    OFSTInt16: "OFSTInt16",       # Signed 16-bit integer
    OFSTFloat32: "OFSTFloat32",   # Single precision (32 bit) floating point
    OFSTJSON: "OFSTJSON",         # JSON content
    OFSTUUID: "OFSTUUID",         # UUID string representation
}

# Mapping of numpy ndarray dtypes to (field type, subtype)
DTYPE_OGR_FIELD_TYPES = {
"int8": (OFTInteger, OFSTInt16),
Expand Down Expand Up @@ -633,8 +662,8 @@ cdef get_fields(OGRLayerH ogr_layer, str encoding, use_arrow=False):

Returns
-------
ndarray(n, 4)
array of index, ogr type, name, numpy type
ndarray(n, 5)
array of index, ogr type, name, numpy type, ogr subtype
"""
cdef int i
cdef int field_count
Expand All @@ -654,7 +683,7 @@ cdef get_fields(OGRLayerH ogr_layer, str encoding, use_arrow=False):

field_count = OGR_FD_GetFieldCount(ogr_featuredef)

fields = np.empty(shape=(field_count, 4), dtype=object)
fields = np.empty(shape=(field_count, 5), dtype=object)
fields_view = fields[:, :]

skipped_fields = False
Expand Down Expand Up @@ -691,6 +720,7 @@ cdef get_fields(OGRLayerH ogr_layer, str encoding, use_arrow=False):
fields_view[i, 1] = field_type
fields_view[i, 2] = field_name
fields_view[i, 3] = np_type
fields_view[i, 4] = field_subtype

if skipped_fields:
# filter out skipped fields
Expand Down Expand Up @@ -1472,11 +1502,18 @@ def ogr_read(
datetime_as_string=datetime_as_string
)

ogr_types = [FIELD_TYPE_NAMES.get(field[1], "Unknown") for field in fields]
ogr_subtypes = [
FIELD_SUBTYPE_NAMES.get(field[4], "Unknown") for field in fields
]

meta = {
"crs": crs,
"encoding": encoding,
"fields": fields[:, 2],
"dtypes": fields[:, 3],
"ogr_types": ogr_types,
"ogr_subtypes": ogr_subtypes,
"geometry_type": geometry_type,
}

Expand Down Expand Up @@ -1804,10 +1841,18 @@ def ogr_open_arrow(
else:
reader = _ArrowStream(capsule)

ogr_types = [FIELD_TYPE_NAMES.get(field[1], "Unknown") for field in fields]
ogr_subtypes = [
FIELD_SUBTYPE_NAMES.get(field[4], "Unknown") for field in fields
]

meta = {
"crs": crs,
"encoding": encoding,
"fields": fields[:, 2],
"dtypes": fields[:, 3],
"ogr_types": ogr_types,
"ogr_subtypes": ogr_subtypes,
"geometry_type": geometry_type,
"geometry_name": geometry_name,
"fid_column": fid_column,
Expand Down Expand Up @@ -1964,13 +2009,19 @@ def ogr_read_info(
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer)

fields = get_fields(ogr_layer, encoding)
ogr_types = [FIELD_TYPE_NAMES.get(field[1], "Unknown") for field in fields]
ogr_subtypes = [
FIELD_SUBTYPE_NAMES.get(field[4], "Unknown") for field in fields
]

meta = {
"layer_name": get_string(OGR_L_GetName(ogr_layer)),
"crs": get_crs(ogr_layer),
"encoding": encoding,
"fields": fields[:, 2],
"dtypes": fields[:, 3],
"ogr_types": ogr_types,
"ogr_subtypes": ogr_subtypes,
"fid_column": get_string(OGR_L_GetFIDColumn(ogr_layer)),
"geometry_name": get_string(OGR_L_GetGeometryColumn(ogr_layer)),
"geometry_type": get_geometry_type(ogr_layer),
Expand Down
3 changes: 3 additions & 0 deletions pyogrio/_ogr.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,9 @@ cdef extern from "ogr_core.h":
OFSTBoolean
OFSTInt16
OFSTFloat32
OFSTJSON
OFSTUUID
OFSTMaxSubType

ctypedef void* OGRDataSourceH
ctypedef void* OGRFeatureDefnH
Expand Down
2 changes: 2 additions & 0 deletions pyogrio/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,8 @@ def read_info(
"crs": "<crs>",
"fields": <ndarray of field names>,
"dtypes": <ndarray of field dtypes>,
"ogr_types": <list of OGR field type names>,
"ogr_subtypes": <list of OGR field subtype names>,
"encoding": "<encoding>",
"fid_column": "<fid column name or "">",
"geometry_name": "<geometry column name or "">",
Expand Down
8 changes: 8 additions & 0 deletions pyogrio/geopandas.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Functions for reading and writing GeoPandas dataframes."""

import json
import os
import warnings

Expand Down Expand Up @@ -330,6 +331,10 @@ def read_dataframe(

del table

for ogr_subtype, c in zip(meta["ogr_subtypes"], df.columns):
if ogr_subtype == "OFSTJSON":
df[c] = df[c].map(json.loads, na_action="ignore")

if fid_as_index:
df = df.set_index(meta["fid_column"])
df.index.names = ["fid"]
Expand Down Expand Up @@ -362,6 +367,9 @@ def read_dataframe(
for dtype, c in zip(meta["dtypes"], df.columns):
if dtype.startswith("datetime"):
df[c] = _try_parse_datetime(df[c])
for ogr_subtype, c in zip(meta["ogr_subtypes"], df.columns):
if ogr_subtype == "OFSTJSON":
df[c] = df[c].map(json.loads, na_action="ignore")

if geometry is None or not read_geometry:
return df
Expand Down
14 changes: 11 additions & 3 deletions pyogrio/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,11 @@ def read(
Meta is: {
"crs": "<crs>",
"fields": <ndarray of field names>,
"dtypes": <ndarray of numpy dtypes corresponding to fields>
"dtypes": <ndarray of numpy dtypes corresponding to fields>,
"ogr_types": <list of OGR type names corresponding to fields>,
"ogr_subtypes": <list of OGR subtype names corresponding to fields>,
"encoding": "<encoding>",
"geometry_type": "<geometry type>"
"geometry_type": "<geometry type>",
}

.. _OGRSQL:
Expand Down Expand Up @@ -249,9 +251,13 @@ def read_arrow(
Meta is: {
"crs": "<crs>",
"fields": <ndarray of field names>,
"dtypes": <ndarray of numpy dtypes corresponding to fields>,
"ogr_types": <list of OGR type names corresponding to fields>,
"ogr_subtypes": <list of OGR subtype names corresponding to fields>,
"encoding": "<encoding>",
"geometry_type": "<geometry_type>",
"geometry_name": "<name of geometry column in arrow table>",
"fid_column": "<name of FID column in arrow table>"
}

"""
Expand Down Expand Up @@ -691,9 +697,11 @@ def write(
options will trigger a warning.

"""
# if dtypes is given, remove it from kwargs (dtypes is included in meta returned by
# remove some unneeded kwargs (e.g. dtypes is included in meta returned by
# read, and it is convenient to pass meta directly into write for round trip tests)
kwargs.pop("dtypes", None)
kwargs.pop("ogr_types", None)
kwargs.pop("ogr_subtypes", None)

path, driver = _get_write_path_driver(path, driver, append=append)

Expand Down
20 changes: 10 additions & 10 deletions pyogrio/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,8 @@ def list_field_values_file(tmp_path):
{
"type": "Feature",
"properties": {
"int64": 1,
"list_int64": [0, 1],
"int": 1,
"list_int": [0, 1],
"list_double": [0.0, 1.0],
"list_string": ["string1", "string2"],
"list_int_with_null": [0, null],
Expand All @@ -226,8 +226,8 @@ def list_field_values_file(tmp_path):
{
"type": "Feature",
"properties": {
"int64": 2,
"list_int64": [2, 3],
"int": 2,
"list_int": [2, 3],
"list_double": [2.0, 3.0],
"list_string": ["string3", "string4", ""],
"list_int_with_null": [2, 3],
Expand All @@ -238,8 +238,8 @@ def list_field_values_file(tmp_path):
{
"type": "Feature",
"properties": {
"int64": 3,
"list_int64": [],
"int": 3,
"list_int": [],
"list_double": [],
"list_string": [],
"list_int_with_null": [],
Expand All @@ -250,8 +250,8 @@ def list_field_values_file(tmp_path):
{
"type": "Feature",
"properties": {
"int64": 4,
"list_int64": null,
"int": 4,
"list_int": null,
"list_double": null,
"list_string": null,
"list_int_with_null": null,
Expand All @@ -262,8 +262,8 @@ def list_field_values_file(tmp_path):
{
"type": "Feature",
"properties": {
"int64": 5,
"list_int64": null,
"int": 5,
"list_int": null,
"list_double": null,
"list_string": [""],
"list_int_with_null": null,
Expand Down
13 changes: 12 additions & 1 deletion pyogrio/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
vsi_rmtree,
vsi_unlink,
)
from pyogrio._compat import GDAL_GE_38
from pyogrio._compat import GDAL_GE_38, GDAL_GE_350
from pyogrio._env import GDALEnv
from pyogrio.errors import DataLayerError, DataSourceError
from pyogrio.raw import read, write
Expand Down Expand Up @@ -583,6 +583,17 @@ def test_read_info_force_total_bounds(
assert info["total_bounds"] is None


def test_read_info_jsonfield(nested_geojson_file):
    """Check the OGR type and subtype names read_info reports for a JSON field."""
    info = read_info(nested_geojson_file)

    # OFSTJSON is only supported for GDAL >= 3.5; otherwise no subtype is expected
    json_subtype = "OFSTJSON" if GDAL_GE_350 else "OFSTNone"

    assert info["ogr_types"] == ["OFTString", "OFTString"]
    assert info["ogr_subtypes"] == ["OFSTNone", json_subtype]


def test_read_info_unspecified_layer_warning(data_dir):
"""Reading a multi-layer file without specifying a layer gives a warning."""
with pytest.warns(UserWarning, match="More than one layer found "):
Expand Down
Loading
Loading