From 0c9b6e10fd25ed8d5730bc5663507e82d73d401e Mon Sep 17 00:00:00 2001 From: Stefanie Lumnitz Date: Sat, 27 Aug 2022 10:50:01 +0000 Subject: [PATCH 1/8] [ENH] add overlay.py work file --- dask_geopandas/overlay.py | 107 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 dask_geopandas/overlay.py diff --git a/dask_geopandas/overlay.py b/dask_geopandas/overlay.py new file mode 100644 index 00000000..6dc7cd4c --- /dev/null +++ b/dask_geopandas/overlay.py @@ -0,0 +1,107 @@ +import warnings + +import numpy as np +import geopandas + +from dask.base import tokenize +from dask.highlevelgraph import HighLevelGraph + +from .core import from_geopandas, GeoDataFrame + + +def overlay(df1, df2, how="intersection", **kwargs): + """ + Overlay of two GeoDataFrames. + + Parameters + ---------- + df1, df2 : geopandas or dask_geopandas GeoDataFrames + If a geopandas.GeoDataFrame is passed, it is considered as a + dask_geopandas.GeoDataFrame with 1 partition (without spatial + partitioning information). + how : string, default 'intersection' + Method of spatial overlay: ‘intersection’, ‘union’, ‘identity’, + ‘symmetric_difference’ or ‘difference’. + keep_geom_type : bool + If True, return only geometries of the same geometry type as df1 has, + if False, return all resulting geometries. Default is None, which will + set keep_geom_type to True but warn upon dropping geometries. + make_validbool : default True + If True, any invalid input geometries are corrected with a call to buffer(0), + if False, a ValueError is raised if any input geometries are invalid. + + Returns + ------- + dask_geopandas.GeoDataFrame + + Notes + ----- + If both the df1 and df2 GeoDataFrame have spatial partitioning + information available (the ``spatial_partitions`` attribute is set), + the output partitions are determined based on intersection of the + spatial partitions. In all other cases, the output partitions are + all combinations (cartesian/cross product) of all input partition + of the df1 and df2 GeoDataFrame. + """ + if "op" in kwargs: + predicate = kwargs.pop("op") + deprecation_message = ( + "The `op` parameter is deprecated and will be removed" + " in a future release. Please use the `predicate` parameter" + " instead." + ) + warnings.warn(deprecation_message, FutureWarning, stacklevel=2) + if how != "inner": + raise NotImplementedError("Only how='inner' is supported df2 now") + + if isinstance(df1, geopandas.GeoDataFrame): + df1 = from_geopandas(df1, npartitions=1) + if isinstance(df2, geopandas.GeoDataFrame): + df2 = from_geopandas(df2, npartitions=1) + + name = "overlay-" + tokenize(df1, df2, how) + meta = geopandas.overlay(df1._meta, df2._meta, how=how) + + if df1.spatial_partitions is not None and df2.spatial_partitions is not None: + # Spatial partitions are known -> use them to trim down the list of + # partitions that need to be joined + parts = geopandas.overlay( + df1.spatial_partitions.to_frame("geometry"), + df2.spatial_partitions.to_frame("geometry"), + how="intersects" + ) + parts_df1 = np.asarray(parts.index) + parts_df2 = np.asarray(parts["index_df2"].values) + using_spatial_partitions = True + else: + # Unknown spatial partitions -> full cartesian (cross) product of all + # combinations of the partitions of the df1 and df2 dataframe + n_df1 = df1.npartitions + n_df2 = df2.npartitions + parts_df1 = np.repeat(np.arange(n_df1), n_df2) + parts_df2 = np.tile(np.arange(n_df2), n_df1) + using_spatial_partitions = False + + dsk = {} + new_spatial_partitions = [] + for i, (l, r) in enumerate(zip(parts_df1, parts_df2)): + dsk[(name, i)] = ( + geopandas.overlay, + (df1._name, l), + (df2._name, r), + how, + ) + # TODO preserve spatial partitions of the output if only df1 has spatial + # partitions + if using_spatial_partitions: + lr = df1.spatial_partitions.iloc[l] + rr = df2.spatial_partitions.iloc[r] + # extent = lr.intersection(rr).buffer(buffer).intersection(lr.union(rr)) + extent = lr.intersection(rr) + new_spatial_partitions.append(extent) + + divisions = [None] * (len(dsk) + 1) + graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df1, df2]) + if not using_spatial_partitions: + new_spatial_partitions = None + return GeoDataFrame(graph, name, meta, divisions, new_spatial_partitions) From 22eb07e15094610e407e585a4137f08bf9866078 Mon Sep 17 00:00:00 2001 From: Stefanie Lumnitz Date: Sat, 27 Aug 2022 11:16:11 +0000 Subject: [PATCH 2/8] [ENH] add overlay.py to __init__ --- dask_geopandas/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dask_geopandas/__init__.py b/dask_geopandas/__init__.py index 96030ae7..b23a5ce8 100644 --- a/dask_geopandas/__init__.py +++ b/dask_geopandas/__init__.py @@ -13,6 +13,7 @@ from .io.parquet import read_parquet, to_parquet from .io.arrow import read_feather, to_feather from .sjoin import sjoin +from .overlay import overlay __version__ = get_versions()["version"] @@ -31,4 +32,5 @@ "to_parquet", "clip", "sjoin", + "overlay" ] From 9bc1da779d5b66fa91ab7fca231e3a86808415f3 Mon Sep 17 00:00:00 2001 From: Stefanie Lumnitz Date: Sat, 27 Aug 2022 11:28:50 +0000 Subject: [PATCH 3/8] [DOC] change to overlay doc-string --- dask_geopandas/overlay.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/dask_geopandas/overlay.py b/dask_geopandas/overlay.py index 6dc7cd4c..d4655907 100644 --- a/dask_geopandas/overlay.py +++ b/dask_geopandas/overlay.py @@ -43,14 +43,7 @@ def overlay(df1, df2, how="intersection", **kwargs): all combinations (cartesian/cross product) of all input partition of the df1 and df2 GeoDataFrame. """ - if "op" in kwargs: - predicate = kwargs.pop("op") - deprecation_message = ( - "The `op` parameter is deprecated and will be removed" - " in a future release. Please use the `predicate` parameter" - " instead." - ) - warnings.warn(deprecation_message, FutureWarning, stacklevel=2) + if how != "inner": raise NotImplementedError("Only how='inner' is supported df2 now") @@ -65,10 +58,11 @@ def overlay(df1, df2, how="intersection", **kwargs): if df1.spatial_partitions is not None and df2.spatial_partitions is not None: # Spatial partitions are known -> use them to trim down the list of # partitions that need to be joined - parts = geopandas.overlay( + parts = geopandas.sjoin( df1.spatial_partitions.to_frame("geometry"), df2.spatial_partitions.to_frame("geometry"), - how="intersects" + how="inner", + predicate="intersects", ) parts_df1 = np.asarray(parts.index) parts_df2 = np.asarray(parts["index_df2"].values) From 1d63a8724fc736a074ffe38d1671e673c965cc4e Mon Sep 17 00:00:00 2001 From: Stefanie Lumnitz Date: Sat, 27 Aug 2022 13:34:09 +0000 Subject: [PATCH 4/8] [ENH] convert partitions to geoseries --- dask_geopandas/overlay.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dask_geopandas/overlay.py b/dask_geopandas/overlay.py index d4655907..1af3c4fa 100644 --- a/dask_geopandas/overlay.py +++ b/dask_geopandas/overlay.py @@ -65,7 +65,7 @@ def overlay(df1, df2, how="intersection", **kwargs): predicate="intersects", ) parts_df1 = np.asarray(parts.index) - parts_df2 = np.asarray(parts["index_df2"].values) + parts_df2 = np.asarray(parts["index_right"].values) using_spatial_partitions = True else: # Unknown spatial partitions -> full cartesian (cross) product of all @@ -96,6 +96,8 @@ def overlay(df1, df2, how="intersection", **kwargs): divisions = [None] * (len(dsk) + 1) graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df1, df2]) - if not using_spatial_partitions: + if using_spatial_partitions: + new_spatial_partitions = geopandas.GeoSeries(data=new_spatial_partitions) + else: new_spatial_partitions = None return GeoDataFrame(graph, name, meta, divisions, new_spatial_partitions) From 4fcce05375c78748f5030d47e80f4716ace212d4 Mon Sep 17 00:00:00 2001 From: Stefanie Lumnitz Date: Sat, 27 Aug 2022 13:47:45 +0000 Subject: [PATCH 5/8] [BUG] pass crs through partition geoseries --- dask_geopandas/overlay.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dask_geopandas/overlay.py b/dask_geopandas/overlay.py index 1af3c4fa..48d7741c 100644 --- a/dask_geopandas/overlay.py +++ b/dask_geopandas/overlay.py @@ -44,9 +44,6 @@ def overlay(df1, df2, how="intersection", **kwargs): of the df1 and df2 GeoDataFrame. """ - if how != "inner": - raise NotImplementedError("Only how='inner' is supported df2 now") - if isinstance(df1, geopandas.GeoDataFrame): df1 = from_geopandas(df1, npartitions=1) if isinstance(df2, geopandas.GeoDataFrame): @@ -97,7 +94,7 @@ def overlay(df1, df2, how="intersection", **kwargs): divisions = [None] * (len(dsk) + 1) graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df1, df2]) if using_spatial_partitions: - new_spatial_partitions = geopandas.GeoSeries(data=new_spatial_partitions) + new_spatial_partitions = geopandas.GeoSeries(data=new_spatial_partitions, crs=df1.crs) else: new_spatial_partitions = None return GeoDataFrame(graph, name, meta, divisions, new_spatial_partitions) From f0b049999dec37e8ee2067abea9173ce282d1427 Mon Sep 17 00:00:00 2001 From: Stefanie Lumnitz Date: Sat, 27 Aug 2022 16:07:24 +0000 Subject: [PATCH 6/8] [DOC] lint overlay.py --- dask_geopandas/__init__.py | 2 +- dask_geopandas/overlay.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dask_geopandas/__init__.py b/dask_geopandas/__init__.py index b23a5ce8..85385dda 100644 --- a/dask_geopandas/__init__.py +++ b/dask_geopandas/__init__.py @@ -32,5 +32,5 @@ "to_parquet", "clip", "sjoin", - "overlay" + "overlay", ] diff --git a/dask_geopandas/overlay.py b/dask_geopandas/overlay.py index 48d7741c..b907b6d9 100644 --- a/dask_geopandas/overlay.py +++ b/dask_geopandas/overlay.py @@ -94,7 +94,9 @@ def overlay(df1, df2, how="intersection", **kwargs): divisions = [None] * (len(dsk) + 1) graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df1, df2]) if using_spatial_partitions: - new_spatial_partitions = geopandas.GeoSeries(data=new_spatial_partitions, crs=df1.crs) + new_spatial_partitions = geopandas.GeoSeries( + data=new_spatial_partitions, crs=df1.crs + ) else: new_spatial_partitions = None return GeoDataFrame(graph, name, meta, divisions, new_spatial_partitions) From 167099a5d1e470fb1e74eeba09e057f05bc0db58 Mon Sep 17 00:00:00 2001 From: Stefanie Lumnitz Date: Sat, 27 Aug 2022 16:32:08 +0000 Subject: [PATCH 7/8] [TST] add test_overlay.py --- dask_geopandas/tests/test_overlay.py | 52 ++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 dask_geopandas/tests/test_overlay.py diff --git a/dask_geopandas/tests/test_overlay.py b/dask_geopandas/tests/test_overlay.py new file mode 100644 index 00000000..e6c5fd7e --- /dev/null +++ b/dask_geopandas/tests/test_overlay.py @@ -0,0 +1,52 @@ +import pytest + +import geopandas +from geopandas.testing import assert_geodataframe_equal + +import dask_geopandas + + +def test_overlay_dask_geopandas(): + world = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres")) + capitals = geopandas.read_file(geopandas.datasets.get_path("naturalearth_cities")) + + # Select South America and some columns + countries = world[world["continent"] == "South America"] + countries = countries[["geometry", "name"]] + + # Project to crs that uses meters as distance measure + countries = countries.to_crs("epsg:3395") + capitals = capitals.to_crs("epsg:3395") + + ddf_countries = dask_geopandas.from_geopandas(countries, npartitions=4) + ddf_capitals = dask_geopandas.from_geopandas(capitals, npartitions=4) + + expected = geopandas.overlay(capitals, countries) + expected = expected.sort_values("name_1").reset_index(drop=True) + + # dask / geopandas + result = dask_geopandas.overlay(ddf_capitals, countries) + assert_geodataframe_equal( + expected, result.compute().sort_values("name_1").reset_index(drop=True) + ) + + # geopandas / dask + result = dask_geopandas.overlay(capitals, ddf_countries) + assert_geodataframe_equal( + expected, result.compute().sort_values("name_1").reset_index(drop=True) + ) + + # dask / dask + result = dask_geopandas.overlay(ddf_capitals, ddf_countries) + assert_geodataframe_equal( + expected, result.compute().sort_values("name_1").reset_index(drop=True) + ) + + # with spatial_partitions + ddf_countries.calculate_spatial_partitions() + ddf_capitals.calculate_spatial_partitions() + result = dask_geopandas.overlay(ddf_capitals, ddf_countries) + assert isinstance(result.spatial_partitions, geopandas.GeoSeries) + assert_geodataframe_equal( + expected, result.compute().sort_values("name_1").reset_index(drop=True) + ) From a514a7b4f074c91f6a5d573cd77efce584bae34a Mon Sep 17 00:00:00 2001 From: Stefanie Lumnitz Date: Sat, 27 Aug 2022 16:34:33 +0000 Subject: [PATCH 8/8] [DOC] linting tests --- dask_geopandas/overlay.py | 2 -- dask_geopandas/tests/test_overlay.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/dask_geopandas/overlay.py b/dask_geopandas/overlay.py index b907b6d9..f3a6a05a 100644 --- a/dask_geopandas/overlay.py +++ b/dask_geopandas/overlay.py @@ -1,5 +1,3 @@ -import warnings - import numpy as np import geopandas diff --git a/dask_geopandas/tests/test_overlay.py b/dask_geopandas/tests/test_overlay.py index e6c5fd7e..70a1daf1 100644 --- a/dask_geopandas/tests/test_overlay.py +++ b/dask_geopandas/tests/test_overlay.py @@ -1,5 +1,3 @@ -import pytest - import geopandas from geopandas.testing import assert_geodataframe_equal