Open
Description
I'm running into an error where `drop_duplicates` fails if the GeoDataFrame originates from a Parquet file and the spatial partitions are encoded in that file. As far as I can tell, `drop_duplicates` works if those two conditions are not met.
import dask.dataframe as dd
import dask_geopandas as dgpd
import geopandas as gpd
import numpy as np
# Minimal reproducer: build three single-partition GeoDataFrames, concatenate
# them, round-trip the result through Parquet, then call drop_duplicates on
# the frame that was read back.
dfs = []
N = 5  # number of rows in each partition
for i in range(3):
    # Every partition gets the same N points; only "col1" differs (it holds i).
    gs = gpd.points_from_xy(np.arange(N), np.arange(N), crs=5070)
    df = gpd.GeoDataFrame(
        {"col1": np.full(N, i), "col2": np.arange(N), "geometry": gs}
    )
    dfs.append(dgpd.from_geopandas(df, npartitions=1))
ddf = dd.concat(dfs)
# The method has to be *called*; a bare attribute reference is a no-op
# statement. NOTE(review): presumably this populates the spatial partitions
# that to_parquet then stores in the file, as the description requires --
# confirm against the dask-geopandas docs.
ddf.calculate_spatial_partitions()
ddf.to_parquet("test.pqt")
ddf = dgpd.read_parquet("test.pqt")
# This is the call that raises the ValueError shown in the traceback.
ddf.drop_duplicates(subset=["col1", "col2"]).compute()
Traceback (most recent call last):
File "/var/mnt/fastdata02/mtbs/test2.py", line 18, in <module>
ddf.drop_duplicates(subset=["col1", "col2"]).compute()
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/dask/dataframe/dask_expr/_collection.py", line 488, in compute
out = out.optimize(fuse=fuse)
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/dask_geopandas/expr.py", line 144, in optimize
result = new_collection(self.expr.optimize(fuse=fuse))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/dask/dataframe/dask_expr/_expr.py", line 94, in optimize
return optimize(self, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/dask/_expr.py", line 775, in optimize
return optimize_until(expr, stage)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/dask/_expr.py", line 794, in optimize_until
expr = expr.lower_completely()
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/dask/_expr.py", line 450, in lower_completely
new = expr.lower_once(lowered)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/dask/_expr.py", line 416, in lower_once
new = operand.lower_once(lowered)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/dask/_expr.py", line 416, in lower_once
new = operand.lower_once(lowered)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/dask/_expr.py", line 405, in lower_once
out = expr._lower()
^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/dask/dataframe/dask_expr/_shuffle.py", line 292, in _lower
[c for c in shuffled.columns if c not in ["_partitions", "_partitions_0"]]
^^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/dask/dataframe/dask_expr/_expr.py", line 471, in columns
return list(self._meta.columns)
^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/functools.py", line 998, in __get__
val = self.func(instance)
^^^^^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/dask/dataframe/dask_expr/_shuffle.py", line 169, in _meta
meta = self.frame._meta
^^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/functools.py", line 998, in __get__
val = self.func(instance)
^^^^^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/dask/dataframe/dask_expr/_expr.py", line 567, in _meta
return self.operation(*args, **self._kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/dask/dataframe/dask_expr/_shuffle.py", line 763, in operation
index = index.astype(dtypes, errors="ignore")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/pandas/core/generic.py", line 6668, in astype
result = concat(results, axis=1, copy=False)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/pandas/core/reshape/concat.py", line 395, in concat
return op.get_result()
^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/pandas/core/reshape/concat.py", line 662, in get_result
return df.__finalize__(self, method="concat")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/pandas/core/generic.py", line 6288, in __finalize__
all(obj.attrs["spatial_partitions"] == attrs["spatial_partitions"] for obj in other.objs[1:])
File "/home/fred/homes/rts/anaconda3/envs/rts/lib/python3.12/site-packages/pandas/core/generic.py", line 1577, in __nonzero__
raise ValueError(
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Environment:
dask 2025.2.0 pyhd8ed1ab_0 conda-forge
dask-core 2025.2.0 pyhd8ed1ab_0 conda-forge
dask-geopandas 0.4.3 pyhd8ed1ab_0 conda-forge
geopandas 1.0.1 pyhd8ed1ab_3 conda-forge
geopandas-base 1.0.1 pyha770c72_3 conda-forge
numpy 2.1.3 py312h58c1407_0 conda-forge
pandas 2.2.3 py312hf9745cd_1 conda-forge
Metadata
Metadata
Assignees
Labels
No labels