From cf092b2a0d85d3e05352fdf5a82396554cfbd1bd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 Aug 2022 16:19:45 +0200 Subject: [PATCH 01/18] ENH: use new columnar GetArrowStream if GDAL>=3.6 and pyarrow available --- pyogrio/_io.pyx | 138 ++++++++++++++++++++++++++++++++++++++++- pyogrio/_ogr.pxd | 12 +++- pyogrio/arrow_bridge.h | 109 ++++++++++++++++++++++++++++++++ pyogrio/geopandas.py | 9 +++ pyogrio/raw.py | 9 ++- 5 files changed, 273 insertions(+), 4 deletions(-) create mode 100644 pyogrio/arrow_bridge.h diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 0841f425..49459833 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -11,7 +11,7 @@ import math import os import warnings -from libc.stdint cimport uint8_t +from libc.stdint cimport uint8_t, uintptr_t from libc.stdlib cimport malloc, free from libc.string cimport strlen from libc.math cimport isnan @@ -931,6 +931,142 @@ def ogr_read( ) +def ogr_read_arrow( + str path, + object layer=None, + object encoding=None, + int read_geometry=True, + int force_2d=False, + object columns=None, + int skip_features=0, + int max_features=0, + object where=None, + tuple bbox=None, + object fids=None, + str sql=None, + str sql_dialect=None, + int return_fids=False, + **kwargs): + + cdef int err = 0 + cdef const char *path_c = NULL + cdef const char *where_c = NULL + cdef OGRDataSourceH ogr_dataset = NULL + cdef OGRLayerH ogr_layer = NULL + cdef int feature_count = 0 + cdef double xmin, ymin, xmax, ymax + cdef ArrowArrayStream stream + cdef ArrowSchema schema + + path_b = path.encode('utf-8') + path_c = path_b + + if fids is not None: + if where is not None or bbox is not None or sql is not None or skip_features or max_features: + raise ValueError( + "cannot set both 'fids' and any of 'where', 'bbox', 'sql', " + "'skip_features' or 'max_features'" + ) + fids = np.asarray(fids, dtype=np.intc) + + if sql is not None and layer is not None: + raise ValueError("'sql' paramater cannot be combined with 'layer'") + + ogr_dataset = ogr_open(path_c, 0, kwargs) + try: + if sql is None: + # layer defaults to index 0 + if layer is None: + layer = 0 + ogr_layer = get_ogr_layer(ogr_dataset, layer) + else: + ogr_layer = execute_sql(ogr_dataset, sql, sql_dialect) + + crs = get_crs(ogr_layer) + + # Encoding is derived from the user, from the dataset capabilities / type, + # or from the system locale + encoding = ( + encoding + or detect_encoding(ogr_dataset, ogr_layer) + or locale.getpreferredencoding() + ) + + fields = get_fields(ogr_layer, encoding) + + if columns is not None: + # Fields are matched exactly by name, duplicates are dropped. + # Find index of each field into fields + idx = np.intersect1d(fields[:,2], columns, return_indices=True)[1] + fields = fields[idx, :] + + geometry_type = get_geometry_type(ogr_layer) + + geometry_name = get_string(OGR_L_GetGeometryColumn(ogr_layer)) + + if fids is not None: + raise ValueError("reading by FID not supported for arrow") + + # Apply the attribute filter + if where is not None and where != "": + apply_where_filter(ogr_layer, where) + + # Apply the spatial filter + if bbox is not None: + apply_spatial_filter(ogr_layer, bbox) + + # Limit feature range to available range + skip_features, max_features = validate_feature_range( + ogr_layer, skip_features, max_features + ) + + # make sure layer is read from beginning + OGR_L_ResetReading(ogr_layer) + + if not OGR_L_GetArrowStream(ogr_layer, &stream, NULL): + raise RuntimeError("Failed to open ArrowArrayStream from Layer") + + stream_ptr = &stream + + import pyarrow as pa + table = pa.RecordBatchStreamReader._import_from_c(stream_ptr).read_all() + + # fid_data, geometries, field_data = get_features( + # ogr_layer, + # fields, + # encoding, + # read_geometry=read_geometry and geometry_type is not None, + # force_2d=force_2d, + # skip_features=skip_features, + # max_features=max_features, + # return_fids=return_fids + # ) + + meta = { + 'crs': crs, + 'encoding': encoding, + 'fields': fields[:,2], # return only names + 'geometry_type': geometry_type, + 'geometry_name': geometry_name, + } + + finally: + pass + if ogr_dataset != NULL: + if sql is not None: + GDALDatasetReleaseResultSet(ogr_dataset, ogr_layer) + + GDALClose(ogr_dataset) + ogr_dataset = NULL + + return ( + meta, + table, + None, #geometries, + None, #field_data + ) + + def ogr_read_bounds( str path, object layer=None, diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd index 95bf0fd7..09ec4baa 100644 --- a/pyogrio/_ogr.pxd +++ b/pyogrio/_ogr.pxd @@ -1,5 +1,5 @@ # Contains declarations against GDAL / OGR API -from libc.stdint cimport int64_t +from libc.stdint cimport int64_t, int8_t from libc.stdio cimport FILE @@ -184,6 +184,14 @@ cdef extern from "ogr_srs_api.h": void OSRRelease(OGRSpatialReferenceH srs) +cdef extern from "arrow_bridge.h": + struct ArrowSchema: + int64_t n_children + + struct ArrowArrayStream: + int (*get_schema)(ArrowArrayStream* stream, ArrowSchema* out) + + cdef extern from "ogr_api.h": int OGRGetDriverCount() OGRSFDriverH OGRGetDriver(int) @@ -264,6 +272,7 @@ cdef extern from "ogr_api.h": OGRErr OGR_L_CreateFeature(OGRLayerH layer, OGRFeatureH feature) OGRErr OGR_L_CreateField(OGRLayerH layer, OGRFieldDefnH fielddefn, int flexible) const char* OGR_L_GetName(OGRLayerH layer) + const char* OGR_L_GetGeometryColumn(OGRLayerH layer) OGRSpatialReferenceH OGR_L_GetSpatialRef(OGRLayerH layer) int OGR_L_TestCapability(OGRLayerH layer, const char *name) OGRFeatureDefnH OGR_L_GetLayerDefn(OGRLayerH layer) @@ -274,6 +283,7 @@ cdef extern from "ogr_api.h": OGRErr OGR_L_SetNextByIndex(OGRLayerH layer, int nIndex) int OGR_L_GetFeatureCount(OGRLayerH layer, int m) void OGR_L_SetSpatialFilterRect(OGRLayerH layer, double xmin, double ymin, double xmax, double ymax) + int8_t OGR_L_GetArrowStream(OGRLayerH hLayer, ArrowArrayStream *out_stream, char** papszOptions) void OGRSetNonLinearGeometriesEnabledFlag(int bFlag) int OGRGetNonLinearGeometriesEnabledFlag() diff --git a/pyogrio/arrow_bridge.h b/pyogrio/arrow_bridge.h new file mode 100644 index 00000000..d269e765 --- /dev/null +++ b/pyogrio/arrow_bridge.h @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file is an extract https://github.com/apache/arrow/blob/master/cpp/src/arrow/c/abi.h +// WARNING: DO NOT MODIFY the content as it would break interoperability ! + +#pragma once + +/*! @cond Doxygen_Suppress */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define ARROW_FLAG_DICTIONARY_ORDERED 1 +#define ARROW_FLAG_NULLABLE 2 +#define ARROW_FLAG_MAP_KEYS_SORTED 4 + +struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; +}; + +struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; +}; +// EXPERIMENTAL: C stream interface + +struct ArrowArrayStream { + // Callback to get the stream type + // (will be the same for all arrays in the stream). + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowSchema must be released independently from the stream. + int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); + + // Callback to get the next array + // (if no error and the array is released, the stream has ended) + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowArray must be released independently from the stream. + int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); + + // Callback to get optional detailed error information. + // This must only be called if the last stream operation failed + // with a non-0 return code. + // + // Return value: pointer to a null-terminated character array describing + // the last error, or NULL if no description is available. + // + // The returned pointer is only valid until the next operation on this stream + // (including release). + const char* (*get_last_error)(struct ArrowArrayStream*); + + // Release callback: release the stream's own resources. + // Note that arrays returned by `get_next` must be individually released. + void (*release)(struct ArrowArrayStream*); + + // Opaque producer-specific data + void* private_data; +}; + +#ifdef __cplusplus +} +#endif + +/*! @endcond */ diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 9718297c..5c940471 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -33,6 +33,7 @@ def read_dataframe( sql=None, sql_dialect=None, fid_as_index=False, + use_arrow=False, ): """Read from an OGR data source to a GeoPandas GeoDataFrame or Pandas DataFrame. If the data source does not have a geometry column or ``read_geometry`` is False, @@ -146,8 +147,16 @@ def read_dataframe( sql=sql, sql_dialect=sql_dialect, return_fids=fid_as_index, + use_arrow=use_arrow, ) + if use_arrow: + table = index + df = table.to_pandas() + geometry_name = meta["geometry_name"] or df.columns[-1] + df["geometry"] = from_wkb(df.pop(geometry_name), crs=meta["crs"]) + return gp.GeoDataFrame(df, geometry="geometry") + columns = meta["fields"].tolist() data = {columns[i]: field_data[i] for i in range(len(columns))} if fid_as_index: diff --git a/pyogrio/raw.py b/pyogrio/raw.py index e046d2b9..63d66539 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -5,7 +5,7 @@ from pyogrio.util import get_vsi_path with GDALEnv(): - from pyogrio._io import ogr_read, ogr_write + from pyogrio._io import ogr_read, ogr_read_arrow, ogr_write from pyogrio._ogr import remove_virtual_file @@ -42,6 +42,7 @@ def read( sql=None, sql_dialect=None, return_fids=False, + use_arrow=False, ): """Read OGR data source. @@ -114,7 +115,11 @@ def read( path, buffer = get_vsi_path(path_or_buffer) try: - result = ogr_read( + if use_arrow: + func = ogr_read_arrow + else: + func = ogr_read + result = func( path, layer=layer, encoding=encoding, From 13d4b6138e2b77ac2fae0398af1d385bc4214660 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 8 Sep 2022 15:09:34 +0200 Subject: [PATCH 02/18] use compile_time variable --- pyogrio/_io.pyx | 9 +++++++-- pyogrio/_ogr.pxd | 7 ++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 49459833..29ce6ccc 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1023,8 +1023,13 @@ def ogr_read_arrow( # make sure layer is read from beginning OGR_L_ResetReading(ogr_layer) - if not OGR_L_GetArrowStream(ogr_layer, &stream, NULL): - raise RuntimeError("Failed to open ArrowArrayStream from Layer") + IF CTE_GDAL_VERSION >= (3, 6, 0): + + if not OGR_L_GetArrowStream(ogr_layer, &stream, NULL): + raise RuntimeError("Failed to open ArrowArrayStream from Layer") + + ELSE: + raise RuntimeError("Need GDAL>=3.6 for Arrow support") stream_ptr = &stream diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd index 09ec4baa..b8106e1d 100644 --- a/pyogrio/_ogr.pxd +++ b/pyogrio/_ogr.pxd @@ -283,7 +283,6 @@ cdef extern from "ogr_api.h": OGRErr OGR_L_SetNextByIndex(OGRLayerH layer, int nIndex) int OGR_L_GetFeatureCount(OGRLayerH layer, int m) void OGR_L_SetSpatialFilterRect(OGRLayerH layer, double xmin, double ymin, double xmax, double ymax) - int8_t OGR_L_GetArrowStream(OGRLayerH hLayer, ArrowArrayStream *out_stream, char** papszOptions) void OGRSetNonLinearGeometriesEnabledFlag(int bFlag) int OGRGetNonLinearGeometriesEnabledFlag() @@ -296,6 +295,12 @@ cdef extern from "ogr_api.h": const char* OLCFastSpatialFilter +IF CTE_GDAL_VERSION >= (3, 6, 0): + + cdef extern from "ogr_api.h": + int8_t OGR_L_GetArrowStream(OGRLayerH hLayer, ArrowArrayStream *out_stream, char** papszOptions) + + cdef extern from "gdal.h": ctypedef enum GDALDataType: GDT_Unknown From 4eaba0d15e83d6ee5a25bf58bd9552e475034c5a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 8 Sep 2022 15:23:37 +0200 Subject: [PATCH 03/18] Check if pyarrow is available --- pyogrio/raw.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 63d66539..72c06bd6 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -116,6 +116,12 @@ def read( try: if use_arrow: + try: + import pyarrow # noqa + except ImportError: + raise RuntimeError( + "the 'pyarrow' package is required to read using arrow" + ) func = ogr_read_arrow else: func = ogr_read From 9389dc4e4290783622d6c557ebec60ca0d45f800 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 8 Sep 2022 15:38:23 +0200 Subject: [PATCH 04/18] add basic test --- pyogrio/tests/test_arrow.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 pyogrio/tests/test_arrow.py diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py new file mode 100644 index 00000000..a2909068 --- /dev/null +++ b/pyogrio/tests/test_arrow.py @@ -0,0 +1,35 @@ +import pytest + +from pyogrio import __gdal_version__, read_dataframe + +try: + from geopandas.testing import assert_geodataframe_equal +except ImportError: + pass + + +pytest.importorskip("geopandas") +pytest.importorskip("pyarrow") + +pytestmark = pytest.mark.skipif( + __gdal_version__ < (3, 6, 0), reason="Arrow tests require GDAL>=3.6" +) + + +def test_read_arrow(naturalearth_lowres_all_ext): + result = read_dataframe(naturalearth_lowres_all_ext, use_arrow=True) + expected = read_dataframe(naturalearth_lowres_all_ext, use_arrow=False) + + if naturalearth_lowres_all_ext.suffix == ".gpkg": + fid_col = "fid" + else: + fid_col = "OGC_FID" + + assert fid_col in result.columns + result = result.drop(columns=[fid_col]) + + if naturalearth_lowres_all_ext.suffix.startswith(".geojson"): + check_less_precise = True + else: + check_less_precise = False + assert_geodataframe_equal(result, expected, check_less_precise=check_less_precise) From d1a1153d4c0478819346b89f482c0b77c783641b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 8 Sep 2022 15:43:19 +0200 Subject: [PATCH 05/18] install pyarrow on latest GDAL docker build test --- .github/workflows/docker-gdal.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/docker-gdal.yml b/.github/workflows/docker-gdal.yml index 245ef53b..5e8e4443 100644 --- a/.github/workflows/docker-gdal.yml +++ b/.github/workflows/docker-gdal.yml @@ -41,6 +41,11 @@ jobs: python3 -m pip install --no-cache-dir -U pip wheel python3 -m pip install --no-cache-dir -e .[dev,test,geopandas] + - name: Install pyarrow + if: matrix.container == "osgeo/gdal:ubuntu-small-latest" + run: | + python3 -m pip install pyarrow + - name: Test with pytest run: | pytest --cov=pyogrio --cov-report term-missing pyogrio/tests From eb5943dfc680599b3add55b1e72228fd037d37b4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Sep 2022 20:35:43 +0200 Subject: [PATCH 06/18] support selecting columns and ignoring geometry --- pyogrio/_io.pyx | 20 +++++++++++++++----- pyogrio/_ogr.pxd | 1 + pyogrio/tests/test_arrow.py | 5 +++++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 29ce6ccc..7939fc1b 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -953,8 +953,8 @@ def ogr_read_arrow( cdef const char *where_c = NULL cdef OGRDataSourceH ogr_dataset = NULL cdef OGRLayerH ogr_layer = NULL - cdef int feature_count = 0 - cdef double xmin, ymin, xmax, ymax + cdef char **fields_c = NULL + cdef const char *field_c = NULL cdef ArrowArrayStream stream cdef ArrowSchema schema @@ -994,11 +994,12 @@ def ogr_read_arrow( fields = get_fields(ogr_layer, encoding) + ignored_fields = [] if columns is not None: # Fields are matched exactly by name, duplicates are dropped. - # Find index of each field into fields - idx = np.intersect1d(fields[:,2], columns, return_indices=True)[1] - fields = fields[idx, :] + ignored_fields = list(set(fields[:,2]) - set(columns)) + if not read_geometry: + ignored_fields.append("OGR_GEOMETRY") geometry_type = get_geometry_type(ogr_layer) @@ -1020,6 +1021,15 @@ def ogr_read_arrow( ogr_layer, skip_features, max_features ) + # Limit to specified columns + if ignored_fields: + for field in ignored_fields: + field_b = field.encode("utf-8") + field_c = field_b + fields_c = CSLAddString(fields_c, field_c) + + OGR_L_SetIgnoredFields(ogr_layer, fields_c) + # make sure layer is read from beginning OGR_L_ResetReading(ogr_layer) diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd index b8106e1d..939b9021 100644 --- a/pyogrio/_ogr.pxd +++ b/pyogrio/_ogr.pxd @@ -283,6 +283,7 @@ cdef extern from "ogr_api.h": OGRErr OGR_L_SetNextByIndex(OGRLayerH layer, int nIndex) int OGR_L_GetFeatureCount(OGRLayerH layer, int m) void OGR_L_SetSpatialFilterRect(OGRLayerH layer, double xmin, double ymin, double xmax, double ymax) + OGRErr OGR_L_SetIgnoredFields(OGRLayerH layer, const char** fields) void OGRSetNonLinearGeometriesEnabledFlag(int bFlag) int OGRGetNonLinearGeometriesEnabledFlag() diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index a2909068..d9dab1eb 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -33,3 +33,8 @@ def test_read_arrow(naturalearth_lowres_all_ext): else: check_less_precise = False assert_geodataframe_equal(result, expected, check_less_precise=check_less_precise) + + +def test_read_arrow_columns(naturalearth_lowres): + result = read_dataframe(naturalearth_lowres, use_arrow=True, columns=["continent"]) + assert result.columns.tolist() == ["OGC_FID", "continent", "geometry"] From bfa8ae7b7094188855890f7442df6659d4b87629 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Sep 2022 22:24:11 +0200 Subject: [PATCH 07/18] fix reading without geometry --- pyogrio/geopandas.py | 9 ++++++--- pyogrio/tests/test_arrow.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 5c940471..712fb8e6 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -153,9 +153,12 @@ def read_dataframe( if use_arrow: table = index df = table.to_pandas() - geometry_name = meta["geometry_name"] or df.columns[-1] - df["geometry"] = from_wkb(df.pop(geometry_name), crs=meta["crs"]) - return gp.GeoDataFrame(df, geometry="geometry") + geometry_name = meta["geometry_name"] or "wkb_geometry" + if geometry_name in df.columns: + df["geometry"] = from_wkb(df.pop(geometry_name), crs=meta["crs"]) + return gp.GeoDataFrame(df, geometry="geometry") + else: + return df columns = meta["fields"].tolist() data = {columns[i]: field_data[i] for i in range(len(columns))} diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index d9dab1eb..4fb1e413 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -3,6 +3,8 @@ from pyogrio import __gdal_version__, read_dataframe try: + import pandas as pd + from pandas.testing import assert_frame_equal from geopandas.testing import assert_geodataframe_equal except ImportError: pass @@ -38,3 +40,18 @@ def test_read_arrow(naturalearth_lowres_all_ext): def test_read_arrow_columns(naturalearth_lowres): result = read_dataframe(naturalearth_lowres, use_arrow=True, columns=["continent"]) assert result.columns.tolist() == ["OGC_FID", "continent", "geometry"] + + +def test_read_arrow_layer_without_geometry(test_fgdb_vsi): + result = read_dataframe(test_fgdb_vsi, layer="basetable", use_arrow=True) + assert type(result) is pd.DataFrame + + +def test_read_arrow_ignore_geometry(naturalearth_lowres): + result = read_dataframe(naturalearth_lowres, use_arrow=True, read_geometry=False) + assert type(result) is pd.DataFrame + + expected = read_dataframe(naturalearth_lowres, use_arrow=True).drop( + columns=["geometry"] + ) + assert_frame_equal(result, expected) From 41be9cc14acceb2e5918bfc845c242cbd1496686 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Sep 2022 22:36:36 +0200 Subject: [PATCH 08/18] avoid warning for nested types that are supported for arrow --- pyogrio/_io.pyx | 26 ++++++++++---------------- pyogrio/tests/test_arrow.py | 7 +++++++ 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 7939fc1b..41a408a0 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -303,7 +303,7 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer): return None -cdef get_fields(OGRLayerH ogr_layer, str encoding): +cdef get_fields(OGRLayerH ogr_layer, str encoding, use_arrow=False): """Get field names and types for layer. Parameters @@ -311,6 +311,9 @@ cdef get_fields(OGRLayerH ogr_layer, str encoding): ogr_layer : pointer to open OGR layer encoding : str encoding to use when reading field name + use_arrow : bool, default False + If using arrow, all types are supported, and we don't have to + raise warnings Returns ------- @@ -354,7 +357,7 @@ cdef get_fields(OGRLayerH ogr_layer, str encoding): field_type = OGR_Fld_GetType(ogr_fielddef) np_type = FIELD_TYPES[field_type] - if not np_type: + if not np_type and not use_arrow: skipped_fields = True log.warning( f"Skipping field {field_name}: unsupported OGR type: {field_type}") @@ -992,7 +995,7 @@ def ogr_read_arrow( or locale.getpreferredencoding() ) - fields = get_fields(ogr_layer, encoding) + fields = get_fields(ogr_layer, encoding, use_arrow=True) ignored_fields = [] if columns is not None: @@ -1028,7 +1031,7 @@ def ogr_read_arrow( field_c = field_b fields_c = CSLAddString(fields_c, field_c) - OGR_L_SetIgnoredFields(ogr_layer, fields_c) + OGR_L_SetIgnoredFields(ogr_layer, fields_c) # make sure layer is read from beginning OGR_L_ResetReading(ogr_layer) @@ -1046,17 +1049,6 @@ def ogr_read_arrow( import pyarrow as pa table = pa.RecordBatchStreamReader._import_from_c(stream_ptr).read_all() - # fid_data, geometries, field_data = get_features( - # ogr_layer, - # fields, - # encoding, - # read_geometry=read_geometry and geometry_type is not None, - # force_2d=force_2d, - # skip_features=skip_features, - # max_features=max_features, - # return_fids=return_fids - # ) - meta = { 'crs': crs, 'encoding': encoding, @@ -1066,7 +1058,9 @@ def ogr_read_arrow( } finally: - pass + if fields_c != NULL: + CSLDestroy(fields_c) + fields_c = NULL if ogr_dataset != NULL: if sql is not None: GDALDatasetReleaseResultSet(ogr_dataset, ogr_layer) diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index 4fb1e413..eb71ec88 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -55,3 +55,10 @@ def test_read_arrow_ignore_geometry(naturalearth_lowres): columns=["geometry"] ) assert_frame_equal(result, expected) + + +def test_read_arrow_nested_types(test_ogr_types_list): + # with arrow, list types are supported + result = read_dataframe(test_ogr_types_list, use_arrow=True) + assert "list_int64" in result.columns + assert result["list_int64"][0].tolist() == [0, 1] From eabed97fb27df47f4de7eb484908bd6a7ccae88a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Sep 2022 22:40:06 +0200 Subject: [PATCH 09/18] simplify GDAL version check --- pyogrio/_io.pyx | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 41a408a0..694fba0a 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1036,13 +1036,11 @@ def ogr_read_arrow( # make sure layer is read from beginning OGR_L_ResetReading(ogr_layer) - IF CTE_GDAL_VERSION >= (3, 6, 0): - - if not OGR_L_GetArrowStream(ogr_layer, &stream, NULL): - raise RuntimeError("Failed to open ArrowArrayStream from Layer") - - ELSE: + IF CTE_GDAL_VERSION < (3, 6, 0): raise RuntimeError("Need GDAL>=3.6 for Arrow support") + + if not OGR_L_GetArrowStream(ogr_layer, &stream, NULL): + raise RuntimeError("Failed to open ArrowArrayStream from Layer") stream_ptr = &stream From 5c039f8475c1574de083adf8c3929897eb175742 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Sep 2022 23:37:18 +0200 Subject: [PATCH 10/18] properly support return_fids --- pyogrio/_io.pyx | 7 ++++++- pyogrio/tests/test_arrow.py | 21 +++++++++++++-------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 694fba0a..7951c3b0 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -958,6 +958,7 @@ def ogr_read_arrow( cdef OGRLayerH ogr_layer = NULL cdef char **fields_c = NULL cdef const char *field_c = NULL + cdef char **options = NULL cdef ArrowArrayStream stream cdef ArrowSchema schema @@ -1033,13 +1034,16 @@ def ogr_read_arrow( OGR_L_SetIgnoredFields(ogr_layer, fields_c) + if not return_fids: + options = CSLSetNameValue(options, "INCLUDE_FID", "NO") + # make sure layer is read from beginning OGR_L_ResetReading(ogr_layer) IF CTE_GDAL_VERSION < (3, 6, 0): raise RuntimeError("Need GDAL>=3.6 for Arrow support") - if not OGR_L_GetArrowStream(ogr_layer, &stream, NULL): + if not OGR_L_GetArrowStream(ogr_layer, &stream, options): raise RuntimeError("Failed to open ArrowArrayStream from Layer") stream_ptr = &stream @@ -1056,6 +1060,7 @@ def ogr_read_arrow( } finally: + CSLDestroy(options) if fields_c != NULL: CSLDestroy(fields_c) fields_c = NULL diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index eb71ec88..14e4ce65 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -22,24 +22,29 @@ def test_read_arrow(naturalearth_lowres_all_ext): result = read_dataframe(naturalearth_lowres_all_ext, use_arrow=True) expected = read_dataframe(naturalearth_lowres_all_ext, use_arrow=False) + if naturalearth_lowres_all_ext.suffix.startswith(".geojson"): + check_less_precise = True + else: + check_less_precise = False + assert_geodataframe_equal(result, expected, check_less_precise=check_less_precise) + + +def test_read_arrow_fid(naturalearth_lowres_all_ext): + result = read_dataframe( + naturalearth_lowres_all_ext, use_arrow=True, fid_as_index=True + ) + if naturalearth_lowres_all_ext.suffix == ".gpkg": fid_col = "fid" else: fid_col = "OGC_FID" assert fid_col in result.columns - result = result.drop(columns=[fid_col]) - - if naturalearth_lowres_all_ext.suffix.startswith(".geojson"): - check_less_precise = True - else: - check_less_precise = False - assert_geodataframe_equal(result, expected, check_less_precise=check_less_precise) def test_read_arrow_columns(naturalearth_lowres): result = read_dataframe(naturalearth_lowres, use_arrow=True, columns=["continent"]) - assert result.columns.tolist() == ["OGC_FID", "continent", "geometry"] + assert result.columns.tolist() == ["continent", "geometry"] def test_read_arrow_layer_without_geometry(test_fgdb_vsi): From 74706f8887af26ea881de83010e59c0228d87a5c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Sep 2022 23:47:18 +0200 Subject: [PATCH 11/18] clean-up --- pyogrio/_io.pyx | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 7951c3b0..be8d5c78 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -966,12 +966,12 @@ def ogr_read_arrow( path_c = path_b if fids is not None: - if where is not None or bbox is not None or sql is not None or skip_features or max_features: - raise ValueError( - "cannot set both 'fids' and any of 'where', 'bbox', 'sql', " - "'skip_features' or 'max_features'" - ) - fids = np.asarray(fids, dtype=np.intc) + raise ValueError("reading by FID is not supported for Arrow") + + if skip_features or max_features: + raise ValueError( + "specifying 'skip_features' or 'max_features' is not supported for Arrow" + ) if sql is not None and layer is not None: raise ValueError("'sql' paramater cannot be combined with 'layer'") @@ -1009,9 +1009,6 @@ def ogr_read_arrow( geometry_name = get_string(OGR_L_GetGeometryColumn(ogr_layer)) - if fids is not None: - raise ValueError("reading by FID not supported for arrow") - # Apply the attribute filter if where is not None and where != "": apply_where_filter(ogr_layer, where) @@ -1020,11 +1017,6 @@ def ogr_read_arrow( if bbox is not None: apply_spatial_filter(ogr_layer, bbox) - # Limit feature range to available range - skip_features, max_features = validate_feature_range( - ogr_layer, skip_features, max_features - ) - # Limit to specified columns if ignored_fields: for field in ignored_fields: @@ -1048,6 +1040,7 @@ def ogr_read_arrow( stream_ptr = &stream + # stream has to be consumed before the Dataset is closed import pyarrow as pa table = pa.RecordBatchStreamReader._import_from_c(stream_ptr).read_all() From 755b43bd3fc33a97b437c493c68e5fce3f3abd2e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Sep 2022 23:53:39 +0200 Subject: [PATCH 12/18] synchronize arrow definitions with latest upstream version --- pyogrio/arrow_bridge.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pyogrio/arrow_bridge.h b/pyogrio/arrow_bridge.h index d269e765..920d498a 100644 --- a/pyogrio/arrow_bridge.h +++ b/pyogrio/arrow_bridge.h @@ -16,18 +16,20 @@ // under the License. // This file is an extract https://github.com/apache/arrow/blob/master/cpp/src/arrow/c/abi.h +// commit 9cbb8a1a626ee301cfe85905b6c18c5d880e176b (2022-06-14) // WARNING: DO NOT MODIFY the content as it would break interoperability ! #pragma once -/*! @cond Doxygen_Suppress */ - #include #ifdef __cplusplus extern "C" { #endif +#ifndef ARROW_C_DATA_INTERFACE +#define ARROW_C_DATA_INTERFACE + #define ARROW_FLAG_DICTIONARY_ORDERED 1 #define ARROW_FLAG_NULLABLE 2 #define ARROW_FLAG_MAP_KEYS_SORTED 4 @@ -64,7 +66,11 @@ struct ArrowArray { // Opaque producer-specific data void* private_data; }; -// EXPERIMENTAL: C stream interface + +#endif // ARROW_C_DATA_INTERFACE + +#ifndef ARROW_C_STREAM_INTERFACE +#define ARROW_C_STREAM_INTERFACE struct ArrowArrayStream { // Callback to get the stream type @@ -102,8 +108,8 @@ struct ArrowArrayStream { void* private_data; }; +#endif // ARROW_C_STREAM_INTERFACE + #ifdef __cplusplus } #endif - -/*! @endcond */ From fd21ad53d4852dd2c9719f650103783262401900 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 23 Sep 2022 13:53:18 +0200 Subject: [PATCH 13/18] raise error if force_2d is used --- pyogrio/_io.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 98d4849d..033ba5e3 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -990,6 +990,9 @@ def ogr_read_arrow( path_b = path.encode('utf-8') path_c = path_b + if force_2d: + raise ValueError("forcing 2D is not supported for Arrow") + if fids is not None: raise ValueError("reading by FID is not supported for Arrow") From f80278c92a5cc95140cffdef90696e4bdc9ebb35 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 18 Dec 2022 14:58:57 +0100 Subject: [PATCH 14/18] add pyarrow to conda test environment --- ci/environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/environment.yml b/ci/environment.yml index db54389f..bf569119 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -7,3 +7,5 @@ dependencies: - pytest - pygeos - geopandas-base + - pyarrow + From 7759a47d4eb19ad7ad6c76ae3eaa35ce901c797e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 18 Dec 2022 15:01:12 +0100 Subject: [PATCH 15/18] fix linting --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 54746557..1897363e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,8 +5,8 @@ repos: hooks: - id: black language_version: python3 - - repo: https://gitlab.com/pycqa/flake8 - rev: 4.0.1 +- repo: https://github.com/pycqa/flake8 + rev: 5.0.4 hooks: - id: flake8 language: python_venv From fd6e175087efbff227f931d501bd2484066602ee Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 18 Dec 2022 15:02:36 +0100 Subject: [PATCH 16/18] fixup linting --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1897363e..a62ac8e1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ repos: hooks: - id: black language_version: python3 -- repo: https://github.com/pycqa/flake8 + - repo: https://github.com/pycqa/flake8 rev: 5.0.4 hooks: - id: flake8 From dff16ee922b5ddf2cf8e07f1787a95abcf8ab2d2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 18 Dec 2022 15:11:46 +0100 Subject: [PATCH 17/18] include arrow_bridge.h in sdists --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 9f14d284..02e7dd2a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,6 @@ include versioneer.py include pyogrio/_version.py include pyogrio/*.pyx pyogrio/*.pxd +include pyogrio/arrow_bridge.h exclude pyogrio/*.c recursive-include pyogrio/tests/fixtures * \ No newline at end of file From 26d09107c8540c011b9d367e406912c966e453a6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 18 Dec 2022 18:52:19 +0100 Subject: [PATCH 18/18] mention in docs --- README.md | 3 +++ docs/source/install.md | 3 +++ 2 files changed, 6 insertions(+) diff --git a/README.md b/README.md index b13b08e2..ec637a51 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,9 @@ Supports Python 3.8 - 3.10 and GDAL 3.1.x - 3.5.x. Reading to GeoDataFrames requires requires `geopandas>=0.8` with `pygeos` enabled. +Additionally, installing `pyarrow` in combination with GDAL 3.6+ enables +a further speed-up when specifying `use_arrow=True`. + ## Installation Pyogrio is currently available on diff --git a/docs/source/install.md b/docs/source/install.md index ef1d19e4..375e244d 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -6,6 +6,9 @@ Supports Python 3.8 - 3.10 and GDAL 3.1.x - 3.5.x Reading to GeoDataFrames requires requires `geopandas>=0.8` with `pygeos` enabled. +Additionally, installing `pyarrow` in combination with GDAL 3.6+ enables +a further speed-up when specifying `use_arrow=True`. + ## Installation ### Conda-forge