Merged

Changes from 5 commits

22 commits
cf092b2
ENH: use new columnar GetArrowStream if GDAL>=3.6 and pyarrow available
jorisvandenbossche Aug 23, 2022
13d4b61
use compile_time variable
jorisvandenbossche Sep 8, 2022
4eaba0d
Check if pyarrow is available
jorisvandenbossche Sep 8, 2022
9389dc4
add basic test
jorisvandenbossche Sep 8, 2022
d1a1153
install pyarrow on latest GDAL docker build test
jorisvandenbossche Sep 8, 2022
eb5943d
support selecting columns and ignoring geometry
jorisvandenbossche Sep 21, 2022
bfa8ae7
fix reading without geometry
jorisvandenbossche Sep 21, 2022
41be9cc
avoid warning for nested types that are supported for arrow
jorisvandenbossche Sep 21, 2022
eabed97
simplify GDAL version check
jorisvandenbossche Sep 21, 2022
5c039f8
properly support return_fids
jorisvandenbossche Sep 21, 2022
74706f8
clean-up
jorisvandenbossche Sep 21, 2022
c643c5d
Merge remote-tracking branch 'upstream/main' into gdal-arrow-stream
jorisvandenbossche Sep 21, 2022
755b43b
synchronize arrow definitions with latest upstream version
jorisvandenbossche Sep 21, 2022
fd21ad5
raise error if force_2d is used
jorisvandenbossche Sep 23, 2022
a67ec7f
Merge remote-tracking branch 'upstream/main' into gdal-arrow-stream
jorisvandenbossche Dec 18, 2022
f80278c
add pyarrow to conda test environment
jorisvandenbossche Dec 18, 2022
7759a47
fix linting
jorisvandenbossche Dec 18, 2022
fd6e175
fixup linting
jorisvandenbossche Dec 18, 2022
dff16ee
include arrow_bridge.h in sdists
jorisvandenbossche Dec 18, 2022
7320702
Merge remote-tracking branch 'upstream/main' into gdal-arrow-stream
jorisvandenbossche Dec 18, 2022
26d0910
mention in docs
jorisvandenbossche Dec 18, 2022
616f090
Merge remote-tracking branch 'upstream/main' into gdal-arrow-stream
jorisvandenbossche Jan 2, 2023
5 changes: 5 additions & 0 deletions .github/workflows/docker-gdal.yml
@@ -41,6 +41,11 @@ jobs:
python3 -m pip install --no-cache-dir -U pip wheel
python3 -m pip install --no-cache-dir -e .[dev,test,geopandas]

- name: Install pyarrow
if: matrix.container == 'osgeo/gdal:ubuntu-small-latest'
run: |
python3 -m pip install pyarrow

- name: Test with pytest
run: |
pytest --cov=pyogrio --cov-report term-missing pyogrio/tests
143 changes: 142 additions & 1 deletion pyogrio/_io.pyx
@@ -11,7 +11,7 @@ import math
import os
import warnings

from libc.stdint cimport uint8_t
from libc.stdint cimport uint8_t, uintptr_t
from libc.stdlib cimport malloc, free
from libc.string cimport strlen
from libc.math cimport isnan
@@ -931,6 +931,147 @@ def ogr_read(
)


def ogr_read_arrow(
str path,
object layer=None,
object encoding=None,
int read_geometry=True,
int force_2d=False,
object columns=None,
int skip_features=0,
int max_features=0,
object where=None,
tuple bbox=None,
object fids=None,
str sql=None,
str sql_dialect=None,
int return_fids=False,
**kwargs):

cdef int err = 0
cdef const char *path_c = NULL
cdef const char *where_c = NULL
cdef OGRDataSourceH ogr_dataset = NULL
cdef OGRLayerH ogr_layer = NULL
cdef int feature_count = 0
cdef double xmin, ymin, xmax, ymax
cdef ArrowArrayStream stream
cdef ArrowSchema schema

path_b = path.encode('utf-8')
path_c = path_b

if fids is not None:
if where is not None or bbox is not None or sql is not None or skip_features or max_features:
raise ValueError(
"cannot set both 'fids' and any of 'where', 'bbox', 'sql', "
"'skip_features' or 'max_features'"
)
fids = np.asarray(fids, dtype=np.intc)

if sql is not None and layer is not None:
raise ValueError("'sql' parameter cannot be combined with 'layer'")

ogr_dataset = ogr_open(path_c, 0, kwargs)
try:
if sql is None:
# layer defaults to index 0
if layer is None:
layer = 0
ogr_layer = get_ogr_layer(ogr_dataset, layer)
else:
ogr_layer = execute_sql(ogr_dataset, sql, sql_dialect)

crs = get_crs(ogr_layer)

# Encoding is derived from the user, from the dataset capabilities / type,
# or from the system locale
encoding = (
encoding
or detect_encoding(ogr_dataset, ogr_layer)
or locale.getpreferredencoding()
)

fields = get_fields(ogr_layer, encoding)

if columns is not None:
# Fields are matched exactly by name, duplicates are dropped.
# Find index of each field into fields
idx = np.intersect1d(fields[:,2], columns, return_indices=True)[1]
fields = fields[idx, :]

geometry_type = get_geometry_type(ogr_layer)

geometry_name = get_string(OGR_L_GetGeometryColumn(ogr_layer))

if fids is not None:
raise ValueError("reading by FID not supported for arrow")

# Apply the attribute filter
if where is not None and where != "":
apply_where_filter(ogr_layer, where)

# Apply the spatial filter
if bbox is not None:
apply_spatial_filter(ogr_layer, bbox)

# Limit feature range to available range
skip_features, max_features = validate_feature_range(
ogr_layer, skip_features, max_features
)

# make sure layer is read from beginning
OGR_L_ResetReading(ogr_layer)

IF CTE_GDAL_VERSION >= (3, 6, 0):

if not OGR_L_GetArrowStream(ogr_layer, &stream, NULL):
raise RuntimeError("Failed to open ArrowArrayStream from Layer")

ELSE:
raise RuntimeError("Need GDAL>=3.6 for Arrow support")

stream_ptr = <uintptr_t> &stream

import pyarrow as pa
table = pa.RecordBatchStreamReader._import_from_c(stream_ptr).read_all()
Review comment from jorisvandenbossche (member, author):

This part is the main piece that differs from the existing ogr_read function, so it might be cleaner to integrate it into that one (to avoid duplicating too much code) by passing the use_arrow keyword through to ogr_read.
On the other hand, that makes the return value a bit messy, unless we changed it into a nested tuple of length 2, (meta, data), where data is (fid_data, geometries, field_data) in the standard case and the Arrow table in the arrow case. And the arrow code path does not handle all keywords for now, so it might also be cleaner to keep it as a separate function.
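For illustration only (not something in this PR), the nested (meta, data) return value discussed above could look roughly like the sketch below; the helper name _read_nested and the exact unpacking of the existing return values are assumptions:

from pyogrio._io import ogr_read, ogr_read_arrow

def _read_nested(path, use_arrow=False, **kwargs):
    # Hypothetical sketch of a (meta, data) return value; not part of this PR.
    if use_arrow:
        meta, table, _, _ = ogr_read_arrow(path, **kwargs)
        # data would be a single pyarrow.Table in the arrow case
        return meta, table
    meta, fid_data, geometries, field_data = ogr_read(path, **kwargs)
    # data would be the (fid_data, geometries, field_data) triple otherwise
    return meta, (fid_data, geometries, field_data)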


# fid_data, geometries, field_data = get_features(
# ogr_layer,
# fields,
# encoding,
# read_geometry=read_geometry and geometry_type is not None,
# force_2d=force_2d,
# skip_features=skip_features,
# max_features=max_features,
# return_fids=return_fids
# )

meta = {
'crs': crs,
'encoding': encoding,
'fields': fields[:,2], # return only names
'geometry_type': geometry_type,
'geometry_name': geometry_name,
}

finally:
pass
if ogr_dataset != NULL:
if sql is not None:
GDALDatasetReleaseResultSet(ogr_dataset, ogr_layer)

GDALClose(ogr_dataset)
ogr_dataset = NULL

return (
meta,
table,
None, #geometries,
None, #field_data
)


def ogr_read_bounds(
str path,
object layer=None,
17 changes: 16 additions & 1 deletion pyogrio/_ogr.pxd
@@ -1,5 +1,5 @@
# Contains declarations against GDAL / OGR API
from libc.stdint cimport int64_t
from libc.stdint cimport int64_t, int8_t
from libc.stdio cimport FILE


@@ -184,6 +184,14 @@ cdef extern from "ogr_srs_api.h":
void OSRRelease(OGRSpatialReferenceH srs)


cdef extern from "arrow_bridge.h":
struct ArrowSchema:
int64_t n_children

struct ArrowArrayStream:
int (*get_schema)(ArrowArrayStream* stream, ArrowSchema* out)


cdef extern from "ogr_api.h":
int OGRGetDriverCount()
OGRSFDriverH OGRGetDriver(int)
@@ -264,6 +272,7 @@ cdef extern from "ogr_api.h":
OGRErr OGR_L_CreateFeature(OGRLayerH layer, OGRFeatureH feature)
OGRErr OGR_L_CreateField(OGRLayerH layer, OGRFieldDefnH fielddefn, int flexible)
const char* OGR_L_GetName(OGRLayerH layer)
const char* OGR_L_GetGeometryColumn(OGRLayerH layer)
OGRSpatialReferenceH OGR_L_GetSpatialRef(OGRLayerH layer)
int OGR_L_TestCapability(OGRLayerH layer, const char *name)
OGRFeatureDefnH OGR_L_GetLayerDefn(OGRLayerH layer)
@@ -286,6 +295,12 @@ cdef extern from "ogr_api.h":
const char* OLCFastSpatialFilter


IF CTE_GDAL_VERSION >= (3, 6, 0):

cdef extern from "ogr_api.h":
int8_t OGR_L_GetArrowStream(OGRLayerH hLayer, ArrowArrayStream *out_stream, char** papszOptions)


cdef extern from "gdal.h":
ctypedef enum GDALDataType:
GDT_Unknown
109 changes: 109 additions & 0 deletions pyogrio/arrow_bridge.h
@@ -0,0 +1,109 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// This file is an extract from https://github.com/apache/arrow/blob/master/cpp/src/arrow/c/abi.h
// WARNING: DO NOT MODIFY the content as it would break interoperability!

#pragma once

/*! @cond Doxygen_Suppress */

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

#define ARROW_FLAG_DICTIONARY_ORDERED 1
#define ARROW_FLAG_NULLABLE 2
#define ARROW_FLAG_MAP_KEYS_SORTED 4

struct ArrowSchema {
// Array type description
const char* format;
const char* name;
const char* metadata;
int64_t flags;
int64_t n_children;
struct ArrowSchema** children;
struct ArrowSchema* dictionary;

// Release callback
void (*release)(struct ArrowSchema*);
// Opaque producer-specific data
void* private_data;
};

struct ArrowArray {
// Array data description
int64_t length;
int64_t null_count;
int64_t offset;
int64_t n_buffers;
int64_t n_children;
const void** buffers;
struct ArrowArray** children;
struct ArrowArray* dictionary;

// Release callback
void (*release)(struct ArrowArray*);
// Opaque producer-specific data
void* private_data;
};
// EXPERIMENTAL: C stream interface

struct ArrowArrayStream {
// Callback to get the stream type
// (will be the same for all arrays in the stream).
//
// Return value: 0 if successful, an `errno`-compatible error code otherwise.
//
// If successful, the ArrowSchema must be released independently from the stream.
int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out);

// Callback to get the next array
// (if no error and the array is released, the stream has ended)
//
// Return value: 0 if successful, an `errno`-compatible error code otherwise.
//
// If successful, the ArrowArray must be released independently from the stream.
int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out);

// Callback to get optional detailed error information.
// This must only be called if the last stream operation failed
// with a non-0 return code.
//
// Return value: pointer to a null-terminated character array describing
// the last error, or NULL if no description is available.
//
// The returned pointer is only valid until the next operation on this stream
// (including release).
const char* (*get_last_error)(struct ArrowArrayStream*);

// Release callback: release the stream's own resources.
// Note that arrays returned by `get_next` must be individually released.
void (*release)(struct ArrowArrayStream*);

// Opaque producer-specific data
void* private_data;
};

#ifdef __cplusplus
}
#endif

/*! @endcond */
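For a rough sense of how this stream interface is consumed, the sketch below (not part of the PR; it only relies on standard, if semi-private, pyarrow APIs) exports a pyarrow reader into an ArrowArrayStream struct and imports it back through a raw pointer, which mirrors what _io.pyx does with the pointer handed over by GDAL:

import pyarrow as pa
from pyarrow.cffi import ffi  # bundles cdefs for the Arrow C ABI structs

# Build a small table and wrap it in a stream of record batches.
table = pa.table({"fid": [1, 2, 3], "name": ["a", "b", "c"]})
reader = pa.RecordBatchReader.from_batches(table.schema, table.to_batches())

# Allocate an ArrowArrayStream and export the reader into it.
c_stream = ffi.new("struct ArrowArrayStream*")
stream_ptr = int(ffi.cast("uintptr_t", c_stream))
reader._export_to_c(stream_ptr)

# A consumer (GDAL is the producer in the PR; here pyarrow plays both roles)
# imports the same pointer and drains the stream into a table.
roundtripped = pa.RecordBatchReader._import_from_c(stream_ptr).read_all()
assert roundtripped.equals(table)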
9 changes: 9 additions & 0 deletions pyogrio/geopandas.py
@@ -33,6 +33,7 @@ def read_dataframe(
sql=None,
sql_dialect=None,
fid_as_index=False,
use_arrow=False,
):
"""Read from an OGR data source to a GeoPandas GeoDataFrame or Pandas DataFrame.
If the data source does not have a geometry column or ``read_geometry`` is False,
@@ -146,8 +147,16 @@ def read_dataframe(
sql=sql,
sql_dialect=sql_dialect,
return_fids=fid_as_index,
use_arrow=use_arrow,
)

if use_arrow:
table = index
df = table.to_pandas()
geometry_name = meta["geometry_name"] or df.columns[-1]
df["geometry"] = from_wkb(df.pop(geometry_name), crs=meta["crs"])
return gp.GeoDataFrame(df, geometry="geometry")

columns = meta["fields"].tolist()
data = {columns[i]: field_data[i] for i in range(len(columns))}
if fid_as_index:
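For context, a minimal usage sketch of the new keyword added in this PR (the file name is hypothetical; use_arrow=True requires GDAL >= 3.6 and an installed pyarrow):

import pyogrio

# Reads through the columnar GDAL ArrowArrayStream path added in this PR;
# raises a RuntimeError if GDAL < 3.6 or pyarrow is not installed.
df = pyogrio.read_dataframe("example.gpkg", use_arrow=True)
print(df.head())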
15 changes: 13 additions & 2 deletions pyogrio/raw.py
@@ -5,7 +5,7 @@
from pyogrio.util import get_vsi_path

with GDALEnv():
from pyogrio._io import ogr_read, ogr_write
from pyogrio._io import ogr_read, ogr_read_arrow, ogr_write
from pyogrio._ogr import remove_virtual_file


@@ -42,6 +42,7 @@ def read(
sql=None,
sql_dialect=None,
return_fids=False,
use_arrow=False,
):
"""Read OGR data source.

@@ -114,7 +115,17 @@
path, buffer = get_vsi_path(path_or_buffer)

try:
result = ogr_read(
if use_arrow:
try:
import pyarrow # noqa
except ImportError:
raise RuntimeError(
"the 'pyarrow' package is required to read using arrow"
)
func = ogr_read_arrow
else:
func = ogr_read
result = func(
path,
layer=layer,
encoding=encoding,
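And a rough sketch of the lower-level entry point, assuming (as the geopandas.py change above suggests) that read() passes the 4-tuple from ogr_read_arrow through unchanged; the file name is again hypothetical:

from pyogrio.raw import read

# With use_arrow=True the second element holds a pyarrow.Table; the geometry
# and field_data slots come back as None in the arrow code path.
meta, table, geometries, field_data = read("example.gpkg", use_arrow=True)
print(meta["fields"], table.num_rows)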