From 3a8f4e6921d4fe06a429bf9d9030d01beddb145a Mon Sep 17 00:00:00 2001 From: leostimpfle Date: Thu, 2 Jul 2026 23:05:29 +0200 Subject: [PATCH] Add support for path-like file_name in to_csv and to_parquet --- _duckdb-stubs/__init__.pyi | 10 ++++----- src/duckdb_python.cpp | 2 +- src/include/duckdb_python/path_like.hpp | 3 +++ src/include/duckdb_python/pyrelation.hpp | 4 ++-- src/path_like.cpp | 26 ++++++++++++++++++++---- src/pyrelation.cpp | 18 +++++++++------- tests/fast/api/test_to_csv.py | 13 ++++++++++++ tests/fast/api/test_to_parquet.py | 13 ++++++++++++ 8 files changed, 70 insertions(+), 19 deletions(-) diff --git a/_duckdb-stubs/__init__.pyi b/_duckdb-stubs/__init__.pyi index 8770483f..d48ccc25 100644 --- a/_duckdb-stubs/__init__.pyi +++ b/_duckdb-stubs/__init__.pyi @@ -707,7 +707,7 @@ class DuckDBPyRelation: def tf(self) -> dict[str, typing.Any]: ... def to_csv( self, - file_name: str, + file_name: str | os.PathLike[str], *, sep: str | None = None, na_rep: str | None = None, @@ -728,7 +728,7 @@ class DuckDBPyRelation: def to_df(self, *, date_as_object: bool = False) -> pandas.DataFrame: ... def to_parquet( self, - file_name: str, + file_name: str | os.PathLike[str], *, compression: ParquetCompression | None = None, field_ids: ParquetFieldsOptions | None = None, @@ -764,7 +764,7 @@ class DuckDBPyRelation: ) -> DuckDBPyRelation: ... def write_csv( self, - file_name: str, + file_name: str | os.PathLike[str], *, sep: str | None = None, na_rep: str | None = None, @@ -784,7 +784,7 @@ class DuckDBPyRelation: ) -> None: ... def write_parquet( self, - file_name: str, + file_name: str | os.PathLike[str], *, compression: ParquetCompression | None = None, field_ids: ParquetFieldsOptions | None = None, @@ -1272,7 +1272,7 @@ def values(*args: IntoValues, connection: DuckDBPyConnection | None = None) -> D def view(view_name: str, *, connection: DuckDBPyConnection | None = None) -> DuckDBPyRelation: ... 
def write_csv( df: pandas.DataFrame, - filename: str, + filename: str | os.PathLike[str], *, sep: str | None = None, na_rep: str | None = None, diff --git a/src/duckdb_python.cpp b/src/duckdb_python.cpp index af0f7abe..84e30330 100644 --- a/src/duckdb_python.cpp +++ b/src/duckdb_python.cpp @@ -866,7 +866,7 @@ static void InitializeConnectionMethods(nb::module_ &m) { nb::arg("connection").none() = nb::none()); m.def( "write_csv", - [](const PandasDataFrame &df, const string &filename, const nb::object &sep = nb::none(), + [](const PandasDataFrame &df, const nb::object &filename, const nb::object &sep = nb::none(), const nb::object &na_rep = nb::none(), const nb::object &header = nb::none(), const nb::object &quotechar = nb::none(), const nb::object &escapechar = nb::none(), const nb::object &date_format = nb::none(), const nb::object &timestamp_format = nb::none(), diff --git a/src/include/duckdb_python/path_like.hpp b/src/include/duckdb_python/path_like.hpp index aa1a429b..446d2a74 100644 --- a/src/include/duckdb_python/path_like.hpp +++ b/src/include/duckdb_python/path_like.hpp @@ -9,6 +9,9 @@ namespace duckdb { struct DuckDBPyConnection; +bool TryDecodePath(const nb::object &object, string &result); +string PathToString(const nb::object &object); + struct PathLike { static PathLike Create(const nb::object &object, DuckDBPyConnection &connection); // The file(s) extracted from object diff --git a/src/include/duckdb_python/pyrelation.hpp b/src/include/duckdb_python/pyrelation.hpp index f71a6327..0cb96255 100644 --- a/src/include/duckdb_python/pyrelation.hpp +++ b/src/include/duckdb_python/pyrelation.hpp @@ -211,7 +211,7 @@ struct DuckDBPyRelation { std::unique_ptr Join(DuckDBPyRelation *other, const nb::object &condition, const string &type); std::unique_ptr Cross(DuckDBPyRelation *other); - void ToParquet(const string &filename, const nb::object &compression = nb::none(), + void ToParquet(const nb::object &file_name, const nb::object &compression = nb::none(), const 
nb::object &field_ids = nb::none(), const nb::object &row_group_size_bytes = nb::none(), const nb::object &row_group_size = nb::none(), const nb::object &overwrite = nb::none(), const nb::object &per_thread_output = nb::none(), const nb::object &use_tmp_file = nb::none(), @@ -219,7 +219,7 @@ struct DuckDBPyRelation { const nb::object &append = nb::none(), const nb::object &filename_pattern = nb::none(), const nb::object &file_size_bytes = nb::none()); - void ToCSV(const string &filename, const nb::object &sep = nb::none(), const nb::object &na_rep = nb::none(), + void ToCSV(const nb::object &file_name, const nb::object &sep = nb::none(), const nb::object &na_rep = nb::none(), const nb::object &header = nb::none(), const nb::object &quotechar = nb::none(), const nb::object &escapechar = nb::none(), const nb::object &date_format = nb::none(), const nb::object &timestamp_format = nb::none(), const nb::object &quoting = nb::none(), diff --git a/src/path_like.cpp b/src/path_like.cpp index de823fd4..b036f683 100644 --- a/src/path_like.cpp +++ b/src/path_like.cpp @@ -34,15 +34,33 @@ struct PathLikeProcessor { vector fs_files; }; -void PathLikeProcessor::AddFile(const nb::object &object) { +bool TryDecodePath(const nb::object &object, string &result) { if (nb::isinstance(object)) { - all_files.push_back(nb::cast(nb::str(object))); - return; + result = nb::cast(object); + return true; } if (nb::isinstance(object) || nb::hasattr(object, "__fspath__")) { // A bytes path or an os.PathLike object (e.g. 
pathlib.Path) - decode it to a string auto fsdecode = nb::module_::import_("os").attr("fsdecode"); - all_files.push_back(nb::cast(nb::str(fsdecode(object)))); + result = nb::cast(fsdecode(object)); + return true; + } + return false; +} + +string PathToString(const nb::object &object) { + string result; + if (!TryDecodePath(object, result)) { + throw InvalidInputException("Expected a str, bytes, or os.PathLike object for the file path, not '%s'", + Py_TYPE(object.ptr())->tp_name); + } + return result; +} + +void PathLikeProcessor::AddFile(const nb::object &object) { + string decoded; + if (TryDecodePath(object, decoded)) { + all_files.push_back(std::move(decoded)); return; } // This is (assumed to be) a file-like object diff --git a/src/pyrelation.cpp b/src/pyrelation.cpp index 632a9f0e..efc1b08c 100644 --- a/src/pyrelation.cpp +++ b/src/pyrelation.cpp @@ -1,5 +1,6 @@ #include "duckdb_python/nb/casters.hpp" #include "duckdb_python/pyrelation.hpp" +#include "duckdb_python/path_like.hpp" #include "duckdb_python/pyconnection/pyconnection.hpp" #include "duckdb_python/pytype.hpp" #include "duckdb_python/pyresult.hpp" @@ -1253,12 +1254,14 @@ static Value NestedDictToStruct(const nb::object &dictionary) { return Value::STRUCT(std::move(children)); } -void DuckDBPyRelation::ToParquet(const string &filename, const nb::object &compression, const nb::object &field_ids, - const nb::object &row_group_size_bytes, const nb::object &row_group_size, - const nb::object &overwrite, const nb::object &per_thread_output, - const nb::object &use_tmp_file, const nb::object &partition_by, - const nb::object &write_partition_columns, const nb::object &append, - const nb::object &filename_pattern, const nb::object &file_size_bytes) { +void DuckDBPyRelation::ToParquet(const nb::object &file_name, const nb::object &compression, + const nb::object &field_ids, const nb::object &row_group_size_bytes, + const nb::object &row_group_size, const nb::object &overwrite, + const nb::object 
&per_thread_output, const nb::object &use_tmp_file, + const nb::object &partition_by, const nb::object &write_partition_columns, + const nb::object &append, const nb::object &filename_pattern, + const nb::object &file_size_bytes) { + auto filename = PathToString(file_name); case_insensitive_map_t> options; if (!nb::none().is(compression)) { @@ -1371,13 +1374,14 @@ void DuckDBPyRelation::ToParquet(const string &filename, const nb::object &compr PyExecuteRelation(write_parquet); } -void DuckDBPyRelation::ToCSV(const string &filename, const nb::object &sep, const nb::object &na_rep, +void DuckDBPyRelation::ToCSV(const nb::object &file_name, const nb::object &sep, const nb::object &na_rep, const nb::object &header, const nb::object &quotechar, const nb::object &escapechar, const nb::object &date_format, const nb::object &timestamp_format, const nb::object &quoting, const nb::object &encoding, const nb::object &compression, const nb::object &overwrite, const nb::object &per_thread_output, const nb::object &use_tmp_file, const nb::object &partition_by, const nb::object &write_partition_columns) { + auto filename = PathToString(file_name); case_insensitive_map_t> options; if (!nb::none().is(sep)) { diff --git a/tests/fast/api/test_to_csv.py b/tests/fast/api/test_to_csv.py index 9e51e316..7a7b37d6 100644 --- a/tests/fast/api/test_to_csv.py +++ b/tests/fast/api/test_to_csv.py @@ -299,3 +299,16 @@ def test_to_csv_use_tmp_file(self): rel.to_csv(temp_file_name, header=True, use_tmp_file=True) csv_rel = duckdb.read_csv(temp_file_name, header=True) assert rel.execute().fetchall() == csv_rel.execute().fetchall() + + def test_to_csv_pathlib(self, tmp_path): + file_path = tmp_path / "test.csv" # pathlib.Path + df = pd.DataFrame({"a": [5, 3, 23, 2], "b": 
[45, 234, 234, 2]}) + rel = duckdb.from_df(df) + with pytest.raises(duckdb.InvalidInputException): + rel.to_csv(123) diff --git a/tests/fast/api/test_to_parquet.py b/tests/fast/api/test_to_parquet.py index 71d5e00e..680a5e91 100644 --- a/tests/fast/api/test_to_parquet.py +++ b/tests/fast/api/test_to_parquet.py @@ -117,6 +117,19 @@ def test_use_tmp_file(self): result = duckdb.read_parquet(temp_file_name) assert rel.execute().fetchall() == result.execute().fetchall() + def test_to_parquet_pathlib(self, tmp_path): + file_name = tmp_path / "test.parquet" # pathlib.Path + df = pd.DataFrame({"a": [5, 3, 23, 2], "b": [45, 234, 234, 2]}) + rel = duckdb.from_df(df) + rel.to_parquet(file_name) + assert rel.execute().fetchall() == duckdb.read_parquet(file_name).execute().fetchall() + + def test_to_parquet_rejects_non_path(self): + df = pd.DataFrame({"a": [5, 3, 23, 2], "b": [45, 234, 234, 2]}) + rel = duckdb.from_df(df) + with pytest.raises(duckdb.InvalidInputException): + rel.to_parquet(123) + def test_per_thread_output(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 num_threads = duckdb.sql("select current_setting('threads')").fetchone()[0]