From d4780ce96fbb101cab89b61b4e120cfa21a63158 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 16 Jun 2026 01:18:53 +0000 Subject: [PATCH 1/8] refactor(bigframes): Extract json conversions to distinct ops --- .../ibis_compiler/scalar_op_registry.py | 68 ++++++++++--------- .../bigframes/core/compile/polars/compiler.py | 5 ++ .../bigframes/core/compile/polars/lowering.py | 3 - .../sqlglot/expressions/generic_ops.py | 35 ---------- .../compile/sqlglot/expressions/json_ops.py | 35 +++++++++- packages/bigframes/bigframes/dataframe.py | 11 ++- .../bigframes/operations/__init__.py | 1 + .../bigframes/operations/generic_ops.py | 33 --------- .../bigframes/operations/json_ops.py | 2 + packages/bigframes/bigframes/series.py | 16 ++++- .../system/small/engines/test_generic_ops.py | 16 ++--- 11 files changed, 102 insertions(+), 123 deletions(-) diff --git a/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 5172d1e7c602..9386e6f25228 100644 --- a/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -922,35 +922,6 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): elif to_type == ibis_dtypes.time: return x_converted.time() - if to_type == ibis_dtypes.json: - if x.type() == ibis_dtypes.string: - return parse_json_in_safe(x) if op.safe else parse_json(x) - if x.type() == ibis_dtypes.bool: - x_bool = typing.cast( - ibis_types.StringValue, - bigframes.core.compile.ibis_types.cast_ibis_value( - x, ibis_dtypes.string, safe=op.safe - ), - ).lower() - return parse_json_in_safe(x_bool) if op.safe else parse_json(x_bool) - if x.type() in (ibis_dtypes.int64, ibis_dtypes.float64): - x_str = bigframes.core.compile.ibis_types.cast_ibis_value( - x, ibis_dtypes.string, safe=op.safe - ) - return parse_json_in_safe(x_str) if op.safe else parse_json(x_str) - - if x.type() == ibis_dtypes.json: - if to_type == ibis_dtypes.int64: - return cast_json_to_int64_in_safe(x) if op.safe else cast_json_to_int64(x) - if to_type == ibis_dtypes.float64: - return ( - cast_json_to_float64_in_safe(x) if op.safe else cast_json_to_float64(x) - ) - if to_type == ibis_dtypes.bool: - return cast_json_to_bool_in_safe(x) if op.safe else cast_json_to_bool(x) - if to_type == ibis_dtypes.string: - return cast_json_to_string_in_safe(x) if op.safe else cast_json_to_string(x) - # TODO: either inline this function, or push rest of this op into the function return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type, safe=op.safe) @@ -1193,9 +1164,42 @@ def parse_json_op_impl(x: ibis_types.Value, op: ops.ParseJSON): return parse_json(json_str=x) -@scalar_op_compiler.register_unary_op(ops.ToJSON) -def to_json_op_impl(json_obj: ibis_types.Value): - return to_json(json_obj=json_obj) +@scalar_op_compiler.register_unary_op(ops.ToJSON, pass_op=True) +def to_json_op_impl(x: ibis_types.Value, op: ops.ToJSON): + if x.type() == ibis_dtypes.string: + return parse_json_in_safe(x) if op.safe else parse_json(x) + if x.type() == ibis_dtypes.bool: + x_bool = typing.cast( + ibis_types.StringValue, + bigframes.core.compile.ibis_types.cast_ibis_value( + x, ibis_dtypes.string, safe=op.safe + ), + ).lower() + return parse_json_in_safe(x_bool) if op.safe else parse_json(x_bool) + if x.type() in (ibis_dtypes.int64, ibis_dtypes.float64): + x_str = bigframes.core.compile.ibis_types.cast_ibis_value( + x, ibis_dtypes.string, safe=op.safe + ) + return parse_json_in_safe(x_str) if op.safe else parse_json(x_str) + raise TypeError(f"Cannot cast to JSON from type {x.type()}") + + +@scalar_op_compiler.register_unary_op(ops.JSONDecode, pass_op=True) +def json_decode_op_impl(x: ibis_types.Value, op: ops.JSONDecode): + to_type = bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype( + op.to_type + ) + if to_type == ibis_dtypes.int64: + return cast_json_to_int64_in_safe(x) if op.safe else cast_json_to_int64(x) + if to_type == ibis_dtypes.float64: + return ( + cast_json_to_float64_in_safe(x) if op.safe else cast_json_to_float64(x) + ) + if to_type == ibis_dtypes.bool: + return cast_json_to_bool_in_safe(x) if op.safe else cast_json_to_bool(x) + if to_type == ibis_dtypes.string: + return cast_json_to_string_in_safe(x) if op.safe else cast_json_to_string(x) + raise TypeError(f"Cannot cast from JSON to type {to_type}") @scalar_op_compiler.register_unary_op(ops.ToJSONString) diff --git a/packages/bigframes/bigframes/core/compile/polars/compiler.py b/packages/bigframes/bigframes/core/compile/polars/compiler.py index 6f24929eeb4e..b93b1403c56a 100644 --- a/packages/bigframes/bigframes/core/compile/polars/compiler.py +++ b/packages/bigframes/bigframes/core/compile/polars/compiler.py @@ -482,6 +482,11 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: assert isinstance(op, json_ops.JSONDecode) return input.str.json_decode(_DTYPE_MAPPING[op.to_type]) + @compile_op.register(json_ops.ToJSON) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + # Polars represents JSON as string, so to_json is cast to String + return input.cast(pl.String()) + @compile_op.register(arr_ops.ToArrayOp) def _(self, op: ops.ToArrayOp, *inputs: pl.Expr) -> pl.Expr: return pl.concat_list(*inputs) diff --git a/packages/bigframes/bigframes/core/compile/polars/lowering.py b/packages/bigframes/bigframes/core/compile/polars/lowering.py index 7416ebc963b4..a56c44a49071 100644 --- a/packages/bigframes/bigframes/core/compile/polars/lowering.py +++ b/packages/bigframes/bigframes/core/compile/polars/lowering.py @@ -412,9 +412,6 @@ def _coerce_comparables( def _lower_cast(cast_op: ops.AsTypeOp, arg: expression.Expression): if arg.output_type == cast_op.to_type: return arg - - if arg.output_type == dtypes.JSON_DTYPE: - return json_ops.JSONDecode(cast_op.to_type).as_expr(arg) if ( arg.output_type == dtypes.STRING_DTYPE and cast_op.to_type == dtypes.DATETIME_DTYPE diff --git a/packages/bigframes/bigframes/core/compile/sqlglot/expressions/generic_ops.py b/packages/bigframes/bigframes/core/compile/sqlglot/expressions/generic_ops.py index 22dcd8bf51ac..2cc27cb8e5a2 100644 --- a/packages/bigframes/bigframes/core/compile/sqlglot/expressions/generic_ops.py +++ b/packages/bigframes/bigframes/core/compile/sqlglot/expressions/generic_ops.py @@ -36,12 +36,6 @@ def _(expr: TypedExpr, op: ops.AsTypeOp) -> sge.Expression: sg_to_type = sqlglot_types.from_bigframes_dtype(to_type) sg_expr = expr.expr - if to_type == dtypes.JSON_DTYPE: - return _cast_to_json(expr, op) - - if from_type == dtypes.JSON_DTYPE: - return _cast_from_json(expr, op) - if to_type == dtypes.INT_DTYPE: result = _cast_to_int(expr, op) if result is not None: @@ -251,35 +245,6 @@ def _(*values: TypedExpr) -> sge.Expression: # Helper functions -def _cast_to_json(expr: TypedExpr, op: ops.AsTypeOp) -> sge.Expression: - from_type = expr.dtype - sg_expr = expr.expr - - if from_type == dtypes.STRING_DTYPE: - func_name = "SAFE.PARSE_JSON" if op.safe else "PARSE_JSON" - return sge.func(func_name, sg_expr) - if from_type in (dtypes.INT_DTYPE, dtypes.BOOL_DTYPE, dtypes.FLOAT_DTYPE): - sg_expr = sge.Cast(this=sg_expr, to="STRING") - return sge.func("PARSE_JSON", sg_expr) - raise TypeError(f"Cannot cast from {from_type} to {dtypes.JSON_DTYPE}") - - -def _cast_from_json(expr: TypedExpr, op: ops.AsTypeOp) -> sge.Expression: - to_type = op.to_type - sg_expr = expr.expr - func_name = "" - if to_type == dtypes.INT_DTYPE: - func_name = "INT64" - elif to_type == dtypes.FLOAT_DTYPE: - func_name = "FLOAT64" - elif to_type == dtypes.BOOL_DTYPE: - func_name = "BOOL" - elif to_type == dtypes.STRING_DTYPE: - func_name = "STRING" - if func_name: - func_name = "SAFE." + func_name if op.safe else func_name - return sge.func(func_name, sg_expr) - raise TypeError(f"Cannot cast from {dtypes.JSON_DTYPE} to {to_type}") def _cast_to_int(expr: TypedExpr, op: ops.AsTypeOp) -> sge.Expression | None: diff --git a/packages/bigframes/bigframes/core/compile/sqlglot/expressions/json_ops.py b/packages/bigframes/bigframes/core/compile/sqlglot/expressions/json_ops.py index f27b1f138d70..6ef0940306b9 100644 --- a/packages/bigframes/bigframes/core/compile/sqlglot/expressions/json_ops.py +++ b/packages/bigframes/bigframes/core/compile/sqlglot/expressions/json_ops.py @@ -17,6 +17,7 @@ import bigframes_vendored.sqlglot.expressions as sge import bigframes.core.compile.sqlglot.expression_compiler as expression_compiler +from bigframes import dtypes from bigframes import operations as ops from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr @@ -69,9 +70,37 @@ def _(expr: TypedExpr) -> sge.Expression: return sge.func("PARSE_JSON", expr.expr) -@register_unary_op(ops.ToJSON) -def _(expr: TypedExpr) -> sge.Expression: - return sge.func("TO_JSON", expr.expr) +@register_unary_op(ops.ToJSON, pass_op=True) +def _(expr: TypedExpr, op: ops.ToJSON) -> sge.Expression: + from_type = expr.dtype + sg_expr = expr.expr + + if from_type == dtypes.STRING_DTYPE: + func_name = "SAFE.PARSE_JSON" if op.safe else "PARSE_JSON" + return sge.func(func_name, sg_expr) + if from_type in (dtypes.INT_DTYPE, dtypes.BOOL_DTYPE, dtypes.FLOAT_DTYPE): + sg_expr = sge.Cast(this=sg_expr, to="STRING") + return sge.func("PARSE_JSON", sg_expr) + raise TypeError(f"Cannot cast from {from_type} to {dtypes.JSON_DTYPE}") + + +@register_unary_op(ops.JSONDecode, pass_op=True) +def _(expr: TypedExpr, op: ops.JSONDecode) -> sge.Expression: + to_type = op.to_type + sg_expr = expr.expr + func_name = "" + if to_type == dtypes.INT_DTYPE: + func_name = "INT64" + elif to_type == dtypes.FLOAT_DTYPE: + func_name = "FLOAT64" + elif to_type == dtypes.BOOL_DTYPE: + func_name = "BOOL" + elif to_type == dtypes.STRING_DTYPE: + func_name = "STRING" + if func_name: + func_name = "SAFE." + func_name if op.safe else func_name + return sge.func(func_name, sg_expr) + raise TypeError(f"Cannot cast from {dtypes.JSON_DTYPE} to {to_type}") @register_unary_op(ops.ToJSONString) diff --git a/packages/bigframes/bigframes/dataframe.py b/packages/bigframes/bigframes/dataframe.py index f5fc7bdfc6b1..2e694e09d64f 100644 --- a/packages/bigframes/bigframes/dataframe.py +++ b/packages/bigframes/bigframes/dataframe.py @@ -442,17 +442,16 @@ def astype( if errors not in ["raise", "null"]: raise ValueError("Arg 'error' must be one of 'raise' or 'null'") - safe_cast = errors == "null" - if isinstance(dtype, dict): result = self.copy() for col, to_type in dtype.items(): - result[col] = result[col].astype(to_type) + result[col] = result[col].astype(to_type, errors=errors) return result - dtype = bigframes.dtypes.bigframes_type(dtype) - - return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast)) + result = self.copy() + for col in result.columns: + result[col] = result[col].astype(dtype, errors=errors) + return result def _should_sql_have_index(self) -> bool: """Should the SQL we pass to BQML and other I/O include the index?""" diff --git a/packages/bigframes/bigframes/operations/__init__.py b/packages/bigframes/bigframes/operations/__init__.py index b8d860029a0f..d6a6193c7579 100644 --- a/packages/bigframes/bigframes/operations/__init__.py +++ b/packages/bigframes/bigframes/operations/__init__.py @@ -128,6 +128,7 @@ ) from bigframes.operations.googlesql import GoogleSqlScalarOp from bigframes.operations.json_ops import ( + JSONDecode, JSONExtract, JSONExtractArray, JSONExtractStringArray, diff --git a/packages/bigframes/bigframes/operations/generic_ops.py b/packages/bigframes/bigframes/operations/generic_ops.py index 9a58f4b8ef33..99cda5fc095f 100644 --- a/packages/bigframes/bigframes/operations/generic_ops.py +++ b/packages/bigframes/bigframes/operations/generic_ops.py @@ -93,10 +93,6 @@ dtypes.STRING_DTYPE, dtypes.INT_DTYPE, ), - ( - dtypes.JSON_DTYPE, - dtypes.INT_DTYPE, - ), # Float casts ( dtypes.BOOL_DTYPE, @@ -118,10 +114,6 @@ dtypes.STRING_DTYPE, dtypes.FLOAT_DTYPE, ), - ( - dtypes.JSON_DTYPE, - dtypes.FLOAT_DTYPE, - ), # Bool casts ( dtypes.INT_DTYPE, @@ -131,10 +123,6 @@ dtypes.FLOAT_DTYPE, dtypes.BOOL_DTYPE, ), - ( - dtypes.JSON_DTYPE, - dtypes.BOOL_DTYPE, - ), # String casts ( dtypes.BYTES_DTYPE, @@ -168,10 +156,6 @@ dtypes.DATE_DTYPE, dtypes.STRING_DTYPE, ), - ( - dtypes.JSON_DTYPE, - dtypes.STRING_DTYPE, - ), # bytes casts ( dtypes.STRING_DTYPE, @@ -276,23 +260,6 @@ dtypes.INT_DTYPE, dtypes.TIMEDELTA_DTYPE, ), - # json casts - ( - dtypes.BOOL_DTYPE, - dtypes.JSON_DTYPE, - ), - ( - dtypes.FLOAT_DTYPE, - dtypes.JSON_DTYPE, - ), - ( - dtypes.STRING_DTYPE, - dtypes.JSON_DTYPE, - ), - ( - dtypes.INT_DTYPE, - dtypes.JSON_DTYPE, - ), ) ) diff --git a/packages/bigframes/bigframes/operations/json_ops.py b/packages/bigframes/bigframes/operations/json_ops.py index 7260a7922305..4aaaa43eac9b 100644 --- a/packages/bigframes/bigframes/operations/json_ops.py +++ b/packages/bigframes/bigframes/operations/json_ops.py @@ -105,6 +105,7 @@ def output_type(self, *input_types): @dataclasses.dataclass(frozen=True) class ToJSON(base_ops.UnaryOp): name: typing.ClassVar[str] = "to_json" + safe: bool = True def output_type(self, *input_types): input_type = input_types[0] @@ -220,6 +221,7 @@ def output_type(self, *input_types): class JSONDecode(base_ops.UnaryOp): name: typing.ClassVar[str] = "json_decode" to_type: dtypes.Dtype + safe: bool = True def output_type(self, *input_types): input_type = input_types[0] diff --git a/packages/bigframes/bigframes/series.py b/packages/bigframes/bigframes/series.py index 262e1859ab92..7841fc705c24 100644 --- a/packages/bigframes/bigframes/series.py +++ b/packages/bigframes/bigframes/series.py @@ -646,9 +646,19 @@ def astype( if errors not in ["raise", "null"]: raise ValueError("Argument 'errors' must be one of 'raise' or 'null'") dtype = bigframes.dtypes.bigframes_type(dtype) - return self._apply_unary_op( - bigframes.operations.AsTypeOp(to_type=dtype, safe=(errors == "null")) - ) + safe = errors == "null" + if dtype == bigframes.dtypes.JSON_DTYPE: + return self._apply_unary_op( + bigframes.operations.ToJSON(safe=safe) + ) + elif self.dtype == bigframes.dtypes.JSON_DTYPE: + return self._apply_unary_op( + bigframes.operations.JSONDecode(to_type=dtype, safe=safe) + ) + else: + return self._apply_unary_op( + bigframes.operations.AsTypeOp(to_type=dtype, safe=safe) + ) def to_pandas( self, diff --git a/packages/bigframes/tests/system/small/engines/test_generic_ops.py b/packages/bigframes/tests/system/small/engines/test_generic_ops.py index 22ad1bfefa4e..6237caf14bad 100644 --- a/packages/bigframes/tests/system/small/engines/test_generic_ops.py +++ b/packages/bigframes/tests/system/small/engines/test_generic_ops.py @@ -263,16 +263,16 @@ def test_engines_astype_time(scalars_array_value: array_value.ArrayValue, engine @pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_from_json(scalars_array_value: array_value.ArrayValue, engine): exprs = [ - ops.AsTypeOp(to_type=bigframes.dtypes.INT_DTYPE).as_expr( + ops.JSONDecode(to_type=bigframes.dtypes.INT_DTYPE).as_expr( expression.const("5", bigframes.dtypes.JSON_DTYPE) ), - ops.AsTypeOp(to_type=bigframes.dtypes.FLOAT_DTYPE).as_expr( + ops.JSONDecode(to_type=bigframes.dtypes.FLOAT_DTYPE).as_expr( expression.const("5", bigframes.dtypes.JSON_DTYPE) ), - ops.AsTypeOp(to_type=bigframes.dtypes.BOOL_DTYPE).as_expr( + ops.JSONDecode(to_type=bigframes.dtypes.BOOL_DTYPE).as_expr( expression.const("true", bigframes.dtypes.JSON_DTYPE) ), - ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE).as_expr( + ops.JSONDecode(to_type=bigframes.dtypes.STRING_DTYPE).as_expr( expression.const('"hello world"', bigframes.dtypes.JSON_DTYPE) ), ] @@ -284,17 +284,17 @@ def test_engines_astype_from_json(scalars_array_value: array_value.ArrayValue, e @pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_to_json(scalars_array_value: array_value.ArrayValue, engine): exprs = [ - ops.AsTypeOp(to_type=bigframes.dtypes.JSON_DTYPE).as_expr( + ops.ToJSON().as_expr( expression.deref("int64_col") ), - ops.AsTypeOp(to_type=bigframes.dtypes.JSON_DTYPE).as_expr( + ops.ToJSON().as_expr( # Use a const since float to json has precision issues expression.const(5.2, bigframes.dtypes.FLOAT_DTYPE) ), - ops.AsTypeOp(to_type=bigframes.dtypes.JSON_DTYPE).as_expr( + ops.ToJSON().as_expr( expression.deref("bool_col") ), - ops.AsTypeOp(to_type=bigframes.dtypes.JSON_DTYPE).as_expr( + ops.ToJSON().as_expr( # Use a const since "str_col" has special chars. expression.const('"hello world"', bigframes.dtypes.STRING_DTYPE) ), From df95b139e6b5124152a6e09ec9038a590cb10dba Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 16 Jun 2026 17:46:02 +0000 Subject: [PATCH 2/8] tests and ruff --- .../ibis_compiler/scalar_op_registry.py | 4 +- .../bigframes/core/compile/polars/lowering.py | 1 - .../bigframes/operations/__init__.py | 1 + packages/bigframes/bigframes/series.py | 4 +- .../system/small/engines/test_generic_ops.py | 8 +--- .../test_astype_from_json/out.sql | 8 ++-- .../test_generic_ops/test_to_json/out.sql | 8 ++++ .../test_json_ops/test_to_json/out.sql | 2 +- .../sqlglot/expressions/test_generic_ops.py | 42 ++++++++----------- 9 files changed, 35 insertions(+), 43 deletions(-) create mode 100644 packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_to_json/out.sql diff --git a/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 9386e6f25228..2dfcd46ebc22 100644 --- a/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1192,9 +1192,7 @@ def json_decode_op_impl(x: ibis_types.Value, op: ops.JSONDecode): if to_type == ibis_dtypes.int64: return cast_json_to_int64_in_safe(x) if op.safe else cast_json_to_int64(x) if to_type == ibis_dtypes.float64: - return ( - cast_json_to_float64_in_safe(x) if op.safe else cast_json_to_float64(x) - ) + return cast_json_to_float64_in_safe(x) if op.safe else cast_json_to_float64(x) if to_type == ibis_dtypes.bool: return cast_json_to_bool_in_safe(x) if op.safe else cast_json_to_bool(x) if to_type == ibis_dtypes.string: diff --git a/packages/bigframes/bigframes/core/compile/polars/lowering.py b/packages/bigframes/bigframes/core/compile/polars/lowering.py index a56c44a49071..5b3d9154b731 100644 --- a/packages/bigframes/bigframes/core/compile/polars/lowering.py +++ b/packages/bigframes/bigframes/core/compile/polars/lowering.py @@ -26,7 +26,6 @@ comparison_ops, datetime_ops, generic_ops, - json_ops, numeric_ops, string_ops, ) diff --git a/packages/bigframes/bigframes/operations/__init__.py b/packages/bigframes/bigframes/operations/__init__.py index d6a6193c7579..a493e7a755bf 100644 --- a/packages/bigframes/bigframes/operations/__init__.py +++ b/packages/bigframes/bigframes/operations/__init__.py @@ -383,6 +383,7 @@ "FloorDtOp", "IntegerLabelToDatetimeOp", # JSON ops + "JSONDecode", "JSONExtract", "JSONExtractArray", "JSONExtractStringArray", diff --git a/packages/bigframes/bigframes/series.py b/packages/bigframes/bigframes/series.py index 7841fc705c24..57f74136548c 100644 --- a/packages/bigframes/bigframes/series.py +++ b/packages/bigframes/bigframes/series.py @@ -648,9 +648,7 @@ def astype( dtype = bigframes.dtypes.bigframes_type(dtype) safe = errors == "null" if dtype == bigframes.dtypes.JSON_DTYPE: - return self._apply_unary_op( - bigframes.operations.ToJSON(safe=safe) - ) + return self._apply_unary_op(bigframes.operations.ToJSON(safe=safe)) elif self.dtype == bigframes.dtypes.JSON_DTYPE: return self._apply_unary_op( bigframes.operations.JSONDecode(to_type=dtype, safe=safe) diff --git a/packages/bigframes/tests/system/small/engines/test_generic_ops.py b/packages/bigframes/tests/system/small/engines/test_generic_ops.py index 6237caf14bad..05739a1c1b63 100644 --- a/packages/bigframes/tests/system/small/engines/test_generic_ops.py +++ b/packages/bigframes/tests/system/small/engines/test_generic_ops.py @@ -284,16 +284,12 @@ def test_engines_astype_from_json(scalars_array_value: array_value.ArrayValue, e @pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_astype_to_json(scalars_array_value: array_value.ArrayValue, engine): exprs = [ - ops.ToJSON().as_expr( - expression.deref("int64_col") - ), + ops.ToJSON().as_expr(expression.deref("int64_col")), ops.ToJSON().as_expr( # Use a const since float to json has precision issues expression.const(5.2, bigframes.dtypes.FLOAT_DTYPE) ), - ops.ToJSON().as_expr( - expression.deref("bool_col") - ), + ops.ToJSON().as_expr(expression.deref("bool_col")), ops.ToJSON().as_expr( # Use a const since "str_col" has special chars. expression.const('"hello world"', bigframes.dtypes.STRING_DTYPE) diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_from_json/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_from_json/out.sql index 4603f503b5e0..c9450a928003 100644 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_from_json/out.sql +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_astype_from_json/out.sql @@ -1,7 +1,7 @@ SELECT - INT64(`json_col`) AS `int64_col`, - FLOAT64(`json_col`) AS `float64_col`, - BOOL(`json_col`) AS `bool_col`, - STRING(`json_col`) AS `string_col`, + SAFE.INT64(`json_col`) AS `int64_col`, + SAFE.FLOAT64(`json_col`) AS `float64_col`, + SAFE.BOOL(`json_col`) AS `bool_col`, + SAFE.STRING(`json_col`) AS `string_col`, SAFE.INT64(`json_col`) AS `int64_w_safe` FROM `bigframes-dev`.`sqlglot_test`.`json_types` AS `bft_0` \ No newline at end of file diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_to_json/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_to_json/out.sql new file mode 100644 index 000000000000..bb5c70a6e1a3 --- /dev/null +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_to_json/out.sql @@ -0,0 +1,8 @@ +SELECT + PARSE_JSON(CAST(`int64_col` AS STRING)) AS `int64_col`, + PARSE_JSON(CAST(`float64_col` AS STRING)) AS `float64_col`, + PARSE_JSON(CAST(`bool_col` AS STRING)) AS `bool_col`, + SAFE.PARSE_JSON(`string_col`) AS `string_col`, + PARSE_JSON(CAST(`bool_col` AS STRING)) AS `bool_w_safe`, + SAFE.PARSE_JSON(`string_col`) AS `string_w_safe` +FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_json_ops/test_to_json/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_json_ops/test_to_json/out.sql index ef89efa653b1..0545577e27f3 100644 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_json_ops/test_to_json/out.sql +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_json_ops/test_to_json/out.sql @@ -1,3 +1,3 @@ SELECT - TO_JSON(`string_col`) AS `string_col` + SAFE.PARSE_JSON(`string_col`) AS `string_col` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py index 185a8df04509..01aaf7bf19e3 100644 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py @@ -110,20 +110,16 @@ def test_astype_string(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql + "\n", "out.sql") -def test_astype_json(scalar_types_df: bpd.DataFrame, snapshot): +def test_to_json(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df ops_map = { - "int64_col": ops.AsTypeOp(to_type=dtypes.JSON_DTYPE).as_expr("int64_col"), - "float64_col": ops.AsTypeOp(to_type=dtypes.JSON_DTYPE).as_expr("float64_col"), - "bool_col": ops.AsTypeOp(to_type=dtypes.JSON_DTYPE).as_expr("bool_col"), - "string_col": ops.AsTypeOp(to_type=dtypes.JSON_DTYPE).as_expr("string_col"), - "bool_w_safe": ops.AsTypeOp(to_type=dtypes.JSON_DTYPE, safe=True).as_expr( - "bool_col" - ), - "string_w_safe": ops.AsTypeOp(to_type=dtypes.JSON_DTYPE, safe=True).as_expr( - "string_col" - ), + "int64_col": ops.ToJSON().as_expr("int64_col"), + "float64_col": ops.ToJSON().as_expr("float64_col"), + "bool_col": ops.ToJSON().as_expr("bool_col"), + "string_col": ops.ToJSON().as_expr("string_col"), + "bool_w_safe": ops.ToJSON(safe=True).as_expr("bool_col"), + "string_w_safe": ops.ToJSON(safe=True).as_expr("string_col"), } sql = utils._apply_ops_to_sql(bf_df, list(ops_map.values()), list(ops_map.keys())) snapshot.assert_match(sql, "out.sql") @@ -133,11 +129,11 @@ def test_astype_from_json(json_types_df: bpd.DataFrame, snapshot): bf_df = json_types_df ops_map = { - "int64_col": ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr("json_col"), - "float64_col": ops.AsTypeOp(to_type=dtypes.FLOAT_DTYPE).as_expr("json_col"), - "bool_col": ops.AsTypeOp(to_type=dtypes.BOOL_DTYPE).as_expr("json_col"), - "string_col": ops.AsTypeOp(to_type=dtypes.STRING_DTYPE).as_expr("json_col"), - "int64_w_safe": ops.AsTypeOp(to_type=dtypes.INT_DTYPE, safe=True).as_expr( + "int64_col": ops.JSONDecode(to_type=dtypes.INT_DTYPE).as_expr("json_col"), + "float64_col": ops.JSONDecode(to_type=dtypes.FLOAT_DTYPE).as_expr("json_col"), + "bool_col": ops.JSONDecode(to_type=dtypes.BOOL_DTYPE).as_expr("json_col"), + "string_col": ops.JSONDecode(to_type=dtypes.STRING_DTYPE).as_expr("json_col"), + "int64_w_safe": ops.JSONDecode(to_type=dtypes.INT_DTYPE, safe=True).as_expr( "json_col" ), } @@ -145,24 +141,20 @@ def test_astype_from_json(json_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") -def test_astype_json_invalid( - scalar_types_df: bpd.DataFrame, json_types_df: bpd.DataFrame -): +def test_tojson_invalid(scalar_types_df: bpd.DataFrame, json_types_df: bpd.DataFrame): # Test invalid cast to JSON - with pytest.raises(TypeError, match="Cannot cast timestamp.* to .*json.*"): + with pytest.raises(TypeError, match="Cannot cast"): ops_map_to = { - "datetime_to_json": ops.AsTypeOp(to_type=dtypes.JSON_DTYPE).as_expr( - "datetime_col" - ), + "datetime_to_json": ops.ToJSON().as_expr("datetime_col"), } utils._apply_ops_to_sql( scalar_types_df, list(ops_map_to.values()), list(ops_map_to.keys()) ) # Test invalid cast from JSON - with pytest.raises(TypeError, match="Cannot cast .*json.* to timestamp.*"): + with pytest.raises(TypeError, match="Cannot cast"): ops_map_from = { - "json_to_datetime": ops.AsTypeOp(to_type=dtypes.DATETIME_DTYPE).as_expr( + "json_to_datetime": ops.JSONDecode(to_type=dtypes.DATETIME_DTYPE).as_expr( "json_col" ), } From 0387109b17fd1fb68a2471b3d9825286986bccd3 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 16 Jun 2026 18:38:27 +0000 Subject: [PATCH 3/8] fix system tests --- .../compile/ibis_compiler/scalar_op_registry.py | 15 +-------------- .../bigframes/core/compile/polars/compiler.py | 9 +++++++-- .../core/compile/sqlglot/expressions/json_ops.py | 6 ++---- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 2dfcd46ebc22..06803b4990bd 100644 --- a/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1168,20 +1168,7 @@ def parse_json_op_impl(x: ibis_types.Value, op: ops.ParseJSON): def to_json_op_impl(x: ibis_types.Value, op: ops.ToJSON): if x.type() == ibis_dtypes.string: return parse_json_in_safe(x) if op.safe else parse_json(x) - if x.type() == ibis_dtypes.bool: - x_bool = typing.cast( - ibis_types.StringValue, - bigframes.core.compile.ibis_types.cast_ibis_value( - x, ibis_dtypes.string, safe=op.safe - ), - ).lower() - return parse_json_in_safe(x_bool) if op.safe else parse_json(x_bool) - if x.type() in (ibis_dtypes.int64, ibis_dtypes.float64): - x_str = bigframes.core.compile.ibis_types.cast_ibis_value( - x, ibis_dtypes.string, safe=op.safe - ) - return parse_json_in_safe(x_str) if op.safe else parse_json(x_str) - raise TypeError(f"Cannot cast to JSON from type {x.type()}") + return to_json(x) @scalar_op_compiler.register_unary_op(ops.JSONDecode, pass_op=True) diff --git a/packages/bigframes/bigframes/core/compile/polars/compiler.py b/packages/bigframes/bigframes/core/compile/polars/compiler.py index b93b1403c56a..221c2eca8bc1 100644 --- a/packages/bigframes/bigframes/core/compile/polars/compiler.py +++ b/packages/bigframes/bigframes/core/compile/polars/compiler.py @@ -484,8 +484,13 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: @compile_op.register(json_ops.ToJSON) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: - # Polars represents JSON as string, so to_json is cast to String - return input.cast(pl.String()) + # Polars represents JSON as string, so to_json is cast to String. + # Handle null values by mapping them to JSON 'null' representation. + return ( + pl.when(input.is_null()) + .then(pl.lit("null")) + .otherwise(input.cast(pl.String())) + ) @compile_op.register(arr_ops.ToArrayOp) def _(self, op: ops.ToArrayOp, *inputs: pl.Expr) -> pl.Expr: diff --git a/packages/bigframes/bigframes/core/compile/sqlglot/expressions/json_ops.py b/packages/bigframes/bigframes/core/compile/sqlglot/expressions/json_ops.py index 6ef0940306b9..ccf17a51fe26 100644 --- a/packages/bigframes/bigframes/core/compile/sqlglot/expressions/json_ops.py +++ b/packages/bigframes/bigframes/core/compile/sqlglot/expressions/json_ops.py @@ -78,10 +78,8 @@ def _(expr: TypedExpr, op: ops.ToJSON) -> sge.Expression: if from_type == dtypes.STRING_DTYPE: func_name = "SAFE.PARSE_JSON" if op.safe else "PARSE_JSON" return sge.func(func_name, sg_expr) - if from_type in (dtypes.INT_DTYPE, dtypes.BOOL_DTYPE, dtypes.FLOAT_DTYPE): - sg_expr = sge.Cast(this=sg_expr, to="STRING") - return sge.func("PARSE_JSON", sg_expr) - raise TypeError(f"Cannot cast from {from_type} to {dtypes.JSON_DTYPE}") + else: + return sge.func("TO_JSON", sg_expr) @register_unary_op(ops.JSONDecode, pass_op=True) From ddd78e0880d25dbf156d003400e9cf1351040b87 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 16 Jun 2026 18:56:03 +0000 Subject: [PATCH 4/8] fix unit tests --- packages/bigframes/bigframes/dtypes.py | 24 +++++++++++++++++-- .../bigframes/operations/json_ops.py | 2 +- .../test_generic_ops/test_to_json/out.sql | 8 +++---- .../sqlglot/expressions/test_generic_ops.py | 4 ++-- 4 files changed, 29 insertions(+), 9 deletions(-) diff --git a/packages/bigframes/bigframes/dtypes.py b/packages/bigframes/bigframes/dtypes.py index e7539c59c7d7..51ee96432390 100644 --- a/packages/bigframes/bigframes/dtypes.py +++ b/packages/bigframes/bigframes/dtypes.py @@ -364,10 +364,30 @@ def is_json_like(type_: ExpressionType) -> bool: return type_ == JSON_DTYPE or type_ == STRING_DTYPE # Including JSON string -def is_json_encoding_type(type_: ExpressionType) -> bool: +def is_json_encoding_type(type_: ExpressionType, strict: bool = False) -> bool: # Types can be converted into JSON. # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_encodings - return type_ != GEO_DTYPE + if is_array_like(type_): + return is_json_encoding_type(get_array_inner_type(type_), strict=strict) + if is_struct_like(type_): + return all( + is_json_encoding_type(field_type, strict=strict) + for field_type in get_struct_fields(type_).values() + ) + + if strict: + # Strict are the types (mostly) defined by json spec, with no/minimal + # encoding/decoding involved. So no temporal types. + return type_ in ( + INT_DTYPE, + FLOAT_DTYPE, + BOOL_DTYPE, + STRING_DTYPE, + JSON_DTYPE, + ) + else: + # GoogleSQL implementation handles anything but GEO + return type_ != GEO_DTYPE def is_numeric(type_: ExpressionType, include_bool: bool = True) -> bool: diff --git a/packages/bigframes/bigframes/operations/json_ops.py b/packages/bigframes/bigframes/operations/json_ops.py index 4aaaa43eac9b..fc39d4a59b61 100644 --- a/packages/bigframes/bigframes/operations/json_ops.py +++ b/packages/bigframes/bigframes/operations/json_ops.py @@ -109,7 +109,7 @@ class ToJSON(base_ops.UnaryOp): def output_type(self, *input_types): input_type = input_types[0] - if not dtypes.is_json_encoding_type(input_type): + if not dtypes.is_json_encoding_type(input_type, strict=True): raise TypeError( "The value to be assigned must be a type that can be encoded as JSON." + f"Received type: {input_type}" diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_to_json/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_to_json/out.sql index bb5c70a6e1a3..9e03d8be87b9 100644 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_to_json/out.sql +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_to_json/out.sql @@ -1,8 +1,8 @@ SELECT - PARSE_JSON(CAST(`int64_col` AS STRING)) AS `int64_col`, - PARSE_JSON(CAST(`float64_col` AS STRING)) AS `float64_col`, - PARSE_JSON(CAST(`bool_col` AS STRING)) AS `bool_col`, + TO_JSON(`int64_col`) AS `int64_col`, + TO_JSON(`float64_col`) AS `float64_col`, + TO_JSON(`bool_col`) AS `bool_col`, SAFE.PARSE_JSON(`string_col`) AS `string_col`, - PARSE_JSON(CAST(`bool_col` AS STRING)) AS `bool_w_safe`, + TO_JSON(`bool_col`) AS `bool_w_safe`, SAFE.PARSE_JSON(`string_col`) AS `string_w_safe` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py index 01aaf7bf19e3..e3669e1b0edc 100644 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/test_generic_ops.py @@ -143,7 +143,7 @@ def test_astype_from_json(json_types_df: bpd.DataFrame, snapshot): def test_tojson_invalid(scalar_types_df: bpd.DataFrame, json_types_df: bpd.DataFrame): # Test invalid cast to JSON - with pytest.raises(TypeError, match="Cannot cast"): + with pytest.raises(TypeError): ops_map_to = { "datetime_to_json": ops.ToJSON().as_expr("datetime_col"), } @@ -152,7 +152,7 @@ def test_tojson_invalid(scalar_types_df: bpd.DataFrame, json_types_df: bpd.DataF ) # Test invalid cast from JSON - with pytest.raises(TypeError, match="Cannot cast"): + with pytest.raises(TypeError): ops_map_from = { "json_to_datetime": ops.JSONDecode(to_type=dtypes.DATETIME_DTYPE).as_expr( "json_col" From e77ac9a7287b379da1a4259b2289fc6367baae4c Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 16 Jun 2026 19:13:56 +0000 Subject: [PATCH 5/8] more fixes --- .../bigframes/core/compile/polars/compiler.py | 38 +++++++++++++------ .../bigframes/operations/json_ops.py | 7 ++++ 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/packages/bigframes/bigframes/core/compile/polars/compiler.py b/packages/bigframes/bigframes/core/compile/polars/compiler.py index 221c2eca8bc1..1a486e24e96f 100644 --- a/packages/bigframes/bigframes/core/compile/polars/compiler.py +++ b/packages/bigframes/bigframes/core/compile/polars/compiler.py @@ -138,11 +138,20 @@ class PolarsExpressionCompiler: Should be extended to dispatch based on bigframes schema types. """ - @functools.singledispatchmethod + _expr_types: dict[int, bigframes.dtypes.ExpressionType] = dataclasses.field( + default_factory=dict, init=False, compare=False + ) + def compile_expression(self, expression: ex.Expression) -> pl.Expr: + res = self._compile_expression(expression) + self._expr_types[id(res)] = expression.output_type + return res + + @functools.singledispatchmethod + def _compile_expression(self, expression: ex.Expression) -> pl.Expr: raise NotImplementedError(f"Cannot compile expression: {expression}") - @compile_expression.register + @_compile_expression.register def _( self, expression: ex.ScalarConstantExpression, @@ -159,21 +168,21 @@ def _( return pl.lit(value, _bigframes_dtype_to_polars_dtype(expression.dtype)) - @compile_expression.register + @_compile_expression.register def _( self, expression: ex.DerefOp, ) -> pl.Expr: return pl.col(expression.id.sql) - @compile_expression.register + @_compile_expression.register def _( self, expression: ex.ResolvedDerefOp, ) -> pl.Expr: return pl.col(expression.id.sql) - @compile_expression.register + @_compile_expression.register def _( self, expression: ex.OpExpression, @@ -484,13 +493,18 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: @compile_op.register(json_ops.ToJSON) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: - # Polars represents JSON as string, so to_json is cast to String. - # Handle null values by mapping them to JSON 'null' representation. - return ( - pl.when(input.is_null()) - .then(pl.lit("null")) - .otherwise(input.cast(pl.String())) - ) + from_type = self._expr_types.get(id(input)) + if from_type in ( + bigframes.dtypes.STRING_DTYPE, + bigframes.dtypes.JSON_DTYPE, + ): + return input + else: + return ( + pl.when(input.is_null()) + .then(pl.lit("null")) + .otherwise(input.cast(pl.String())) + ) @compile_op.register(arr_ops.ToArrayOp) def _(self, op: ops.ToArrayOp, *inputs: pl.Expr) -> pl.Expr: diff --git a/packages/bigframes/bigframes/operations/json_ops.py b/packages/bigframes/bigframes/operations/json_ops.py index fc39d4a59b61..c9b5849f9ed1 100644 --- a/packages/bigframes/bigframes/operations/json_ops.py +++ b/packages/bigframes/bigframes/operations/json_ops.py @@ -230,4 +230,11 @@ def output_type(self, *input_types): "Input type must be a valid JSON object or JSON-formatted string type." + f" Received type: {input_type}" ) + if self.to_type not in ( + dtypes.INT_DTYPE, + dtypes.FLOAT_DTYPE, + dtypes.BOOL_DTYPE, + dtypes.STRING_DTYPE, + ): + raise TypeError(f"Cannot cast from {dtypes.JSON_DTYPE} to {self.to_type}") return self.to_type From 3d85d6ef0e2d3ddf3dca3806047a1afbd3b855a1 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 17 Jun 2026 01:11:52 +0000 Subject: [PATCH 6/8] fixes --- .../ibis_compiler/scalar_op_registry.py | 2 +- .../bigframes/core/compile/polars/compiler.py | 10 ++--- .../compile/sqlglot/expressions/json_ops.py | 6 ++- packages/bigframes/bigframes/dataframe.py | 40 +++++++++++++++---- .../tests/system/small/bigquery/test_json.py | 2 +- .../tests/system/small/test_series.py | 13 +++--- .../test_generic_ops/test_to_json/out.sql | 8 ++-- .../pandas/core/config_init.py | 2 +- 8 files changed, 55 insertions(+), 28 deletions(-) diff --git a/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 06803b4990bd..3f9fcb5b75df 100644 --- a/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/packages/bigframes/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1168,7 +1168,7 @@ def parse_json_op_impl(x: ibis_types.Value, op: ops.ParseJSON): def to_json_op_impl(x: ibis_types.Value, op: ops.ToJSON): if x.type() == ibis_dtypes.string: return parse_json_in_safe(x) if op.safe else parse_json(x) - return to_json(x) + return x.isnull().ifelse(ibis.null().cast(ibis_dtypes.json), to_json(x)) @scalar_op_compiler.register_unary_op(ops.JSONDecode, pass_op=True) diff --git a/packages/bigframes/bigframes/core/compile/polars/compiler.py b/packages/bigframes/bigframes/core/compile/polars/compiler.py index 1a486e24e96f..2477f27b6432 100644 --- a/packages/bigframes/bigframes/core/compile/polars/compiler.py +++ b/packages/bigframes/bigframes/core/compile/polars/compiler.py @@ -487,12 +487,12 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: ) @compile_op.register(json_ops.JSONDecode) - def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + def _(self, op: json_ops.JSONDecode, input: pl.Expr) -> pl.Expr: assert isinstance(op, json_ops.JSONDecode) return input.str.json_decode(_DTYPE_MAPPING[op.to_type]) @compile_op.register(json_ops.ToJSON) - def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + def _(self, op: json_ops.ToJSON, input: pl.Expr) -> pl.Expr: from_type = self._expr_types.get(id(input)) if from_type in ( bigframes.dtypes.STRING_DTYPE, @@ -500,11 +500,7 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: ): return input else: - return ( - pl.when(input.is_null()) - .then(pl.lit("null")) - .otherwise(input.cast(pl.String())) - ) + return input.cast(pl.String()) @compile_op.register(arr_ops.ToArrayOp) def _(self, op: ops.ToArrayOp, *inputs: pl.Expr) -> pl.Expr: diff --git a/packages/bigframes/bigframes/core/compile/sqlglot/expressions/json_ops.py b/packages/bigframes/bigframes/core/compile/sqlglot/expressions/json_ops.py index ccf17a51fe26..f9a92d3d7a6d 100644 --- a/packages/bigframes/bigframes/core/compile/sqlglot/expressions/json_ops.py +++ b/packages/bigframes/bigframes/core/compile/sqlglot/expressions/json_ops.py @@ -75,11 +75,15 @@ def _(expr: TypedExpr, op: ops.ToJSON) -> sge.Expression: from_type = expr.dtype sg_expr = expr.expr + # Parsing really should be a distinct operation from serialization, but + # this was the way things were intially launched. if from_type == dtypes.STRING_DTYPE: func_name = "SAFE.PARSE_JSON" if op.safe else "PARSE_JSON" return sge.func(func_name, sg_expr) else: - return sge.func("TO_JSON", sg_expr) + return sge.func( + "IF", sg_expr.is_(sge.Null()), sge.Null(), sge.func("TO_JSON", sg_expr) + ) @register_unary_op(ops.JSONDecode, pass_op=True) diff --git a/packages/bigframes/bigframes/dataframe.py b/packages/bigframes/bigframes/dataframe.py index 2e694e09d64f..2d857ecee9e0 100644 --- a/packages/bigframes/bigframes/dataframe.py +++ b/packages/bigframes/bigframes/dataframe.py @@ -443,15 +443,39 @@ def astype( raise ValueError("Arg 'error' must be one of 'raise' or 'null'") if isinstance(dtype, dict): - result = self.copy() - for col, to_type in dtype.items(): - result[col] = result[col].astype(to_type, errors=errors) - return result + for col in dtype: + if col not in self.columns: + raise KeyError( + f"Only Column Names are allowed in dtypes dict. '{col}' is not in the columns." + ) - result = self.copy() - for col in result.columns: - result[col] = result[col].astype(dtype, errors=errors) - return result + safe_cast = errors == "null" + + exprs = [] + for col_id, col_label in zip( + self._block.value_columns, self._block.column_labels + ): + from_type = self._block._column_type(col_id) + + if isinstance(dtype, dict): + if col_label not in dtype: + exprs.append(ex.deref(col_id)) + continue + to_type = bigframes.dtypes.bigframes_type(dtype[col_label]) + else: + to_type = bigframes.dtypes.bigframes_type(dtype) + + if to_type == bigframes.dtypes.JSON_DTYPE: + op = ops.ToJSON(safe=safe_cast) + elif from_type == bigframes.dtypes.JSON_DTYPE: + op = ops.JSONDecode(to_type=to_type, safe=safe_cast) + else: + op = ops.AsTypeOp(to_type=to_type, safe=safe_cast) + + exprs.append(op.as_expr(ex.deref(col_id))) + + block = self._block.project_exprs(exprs, labels=self.columns, drop=True) + return DataFrame(block) def _should_sql_have_index(self) -> bool: """Should the SQL we pass to BQML and other I/O include the index?""" diff --git a/packages/bigframes/tests/system/small/bigquery/test_json.py b/packages/bigframes/tests/system/small/bigquery/test_json.py index 4fc4d2283ece..2d97172e7b5c 100644 --- a/packages/bigframes/tests/system/small/bigquery/test_json.py +++ b/packages/bigframes/tests/system/small/bigquery/test_json.py @@ -390,7 +390,7 @@ def test_parse_json_w_invalid_series_type(): def test_to_json_from_int(): s = bpd.Series([1, 2, None, 3]) actual = bbq.to_json(s) - expected = bpd.Series(["1.0", "2.0", "null", "3.0"], dtype=dtypes.JSON_DTYPE) + expected = bpd.Series(["1.0", "2.0", None, "3.0"], dtype=dtypes.JSON_DTYPE) pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) diff --git a/packages/bigframes/tests/system/small/test_series.py b/packages/bigframes/tests/system/small/test_series.py index 5df88e930432..2e80b75c0b41 100644 --- a/packages/bigframes/tests/system/small/test_series.py +++ b/packages/bigframes/tests/system/small/test_series.py @@ -4019,25 +4019,28 @@ def test_timestamp_astype_string(session): @pytest.mark.parametrize("errors", ["raise", "null"]) def test_float_astype_json(errors, session): - data = ["1.25", "2500000000", None, "-12323.24"] + data = ["1.25", "2500000000.1", None, "-12323.24"] bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE, session=session) bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) assert bf_result.dtype == dtypes.JSON_DTYPE + bf_result_pandas = bf_result.to_pandas() - expected_result = pd.Series(data, dtype=dtypes.JSON_DTYPE) + expected_data = [float(x) if x is not None else None for x in data] + expected_result = pd.Series(expected_data, dtype=dtypes.JSON_DTYPE) expected_result.index = expected_result.index.astype("Int64") - bigframes.testing.utils.assert_series_equal(bf_result.to_pandas(), expected_result) + bigframes.testing.utils.assert_series_equal(bf_result_pandas, expected_result) def test_float_astype_json_str(session): - data = ["1.25", "2500000000", None, "-12323.24"] + data = ["1.25", "2500000000.1", None, "-12323.24"] bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE, session=session) bf_result = bf_series.astype("json") assert bf_result.dtype == dtypes.JSON_DTYPE - expected_result = pd.Series(data, dtype=dtypes.JSON_DTYPE) + expected_data = [float(x) if x is not None else None for x in data] + expected_result = pd.Series(expected_data, dtype=dtypes.JSON_DTYPE) expected_result.index = expected_result.index.astype("Int64") bigframes.testing.utils.assert_series_equal(bf_result.to_pandas(), expected_result) diff --git a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_to_json/out.sql b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_to_json/out.sql index 9e03d8be87b9..86d6f0e9fbb4 100644 --- a/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_to_json/out.sql +++ b/packages/bigframes/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_to_json/out.sql @@ -1,8 +1,8 @@ SELECT - TO_JSON(`int64_col`) AS `int64_col`, - TO_JSON(`float64_col`) AS `float64_col`, - TO_JSON(`bool_col`) AS `bool_col`, + IF(`int64_col` IS NULL, NULL, TO_JSON(`int64_col`)) AS `int64_col`, + IF(`float64_col` IS NULL, NULL, TO_JSON(`float64_col`)) AS `float64_col`, + IF(`bool_col` IS NULL, NULL, TO_JSON(`bool_col`)) AS `bool_col`, SAFE.PARSE_JSON(`string_col`) AS `string_col`, - TO_JSON(`bool_col`) AS `bool_w_safe`, + IF(`bool_col` IS NULL, NULL, TO_JSON(`bool_col`)) AS `bool_w_safe`, SAFE.PARSE_JSON(`string_col`) AS `string_w_safe` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0` \ No newline at end of file diff --git a/packages/bigframes/third_party/bigframes_vendored/pandas/core/config_init.py b/packages/bigframes/third_party/bigframes_vendored/pandas/core/config_init.py index bd40d05154b9..c94a1cbc41af 100644 --- a/packages/bigframes/third_party/bigframes_vendored/pandas/core/config_init.py +++ b/packages/bigframes/third_party/bigframes_vendored/pandas/core/config_init.py @@ -116,7 +116,7 @@ class DisplayOptions: >>> bpd.options.display.progress_bar = "terminal" # doctest: +SKIP """ - repr_mode: Literal["head", "deferred", "anywidget"] = "head" + repr_mode: Literal["head", "deferred", "anywidget"] = "deferred" """ Determines how to display a DataFrame or Series. Default "head". From e15ed606d0ebc2c9d3d01094c321b5cb364cbefd Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 17 Jun 2026 01:17:01 +0000 Subject: [PATCH 7/8] revert config default --- .../third_party/bigframes_vendored/pandas/core/config_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/bigframes/third_party/bigframes_vendored/pandas/core/config_init.py b/packages/bigframes/third_party/bigframes_vendored/pandas/core/config_init.py index c94a1cbc41af..bd40d05154b9 100644 --- a/packages/bigframes/third_party/bigframes_vendored/pandas/core/config_init.py +++ b/packages/bigframes/third_party/bigframes_vendored/pandas/core/config_init.py @@ -116,7 +116,7 @@ class DisplayOptions: >>> bpd.options.display.progress_bar = "terminal" # doctest: +SKIP """ - repr_mode: Literal["head", "deferred", "anywidget"] = "deferred" + repr_mode: Literal["head", "deferred", "anywidget"] = "head" """ Determines how to display a DataFrame or Series. Default "head". From ef077b38dd30156cba47236bc1fb1ecc5b9ba772 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 17 Jun 2026 17:26:25 +0000 Subject: [PATCH 8/8] lint --- packages/bigframes/bigframes/dataframe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/bigframes/bigframes/dataframe.py b/packages/bigframes/bigframes/dataframe.py index 2d857ecee9e0..e64e640287f2 100644 --- a/packages/bigframes/bigframes/dataframe.py +++ b/packages/bigframes/bigframes/dataframe.py @@ -451,7 +451,7 @@ def astype( safe_cast = errors == "null" - exprs = [] + exprs: list[ex.Expression] = [] for col_id, col_label in zip( self._block.value_columns, self._block.column_labels ): @@ -465,6 +465,7 @@ def astype( else: to_type = bigframes.dtypes.bigframes_type(dtype) + op: ops.UnaryOp if to_type == bigframes.dtypes.JSON_DTYPE: op = ops.ToJSON(safe=safe_cast) elif from_type == bigframes.dtypes.JSON_DTYPE: