apache · goel-skd · Jun 19, 2026 · Jun 19, 2026 · Jun 26, 2026 · wgtmac
diff --git a/LICENSE b/LICENSE
@@ -228,3 +228,95 @@ Home page: https://arrow.apache.org/
 License: https://www.apache.org/licenses/LICENSE-2.0
 
 --------------------------------------------------------------------------------
+
+This product bundles utf8proc, which is available under the MIT License:
+
+utf8proc is a software package originally developed by Jan Behrens and the rest
+of the Public Software Group, now maintained by the Julia-language developers.
+All new work on the utf8proc library is licensed under the MIT "expat" license:
+
+Copyright (c) 2014-2021 by Steven G. Johnson, Jiahao Chen, Tony Kelman, Jonas
+Fonseca, and other contributors listed in the git history.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+The original utf8proc is licensed under the same MIT "expat" license:
+
+Copyright (c) 2009, 2013 Public Software Group e. V., Berlin, Germany
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+utf8proc also contains data derived from the Unicode data files. The following
+license applies to that data:
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed under
+the Terms of Use in http://www.unicode.org/copyright.html.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of the Unicode data files and any associated documentation (the "Data
+Files") or Unicode software and any associated documentation (the
+"Software") to deal in the Data Files or Software without restriction,
+including without limitation the rights to use, copy, modify, merge,
+publish, distribute, and/or sell copies of the Data Files or Software, and
+to permit persons to whom the Data Files or Software are furnished to do
+so, provided that (a) the above copyright notice(s) and this permission
+notice appear with all copies of the Data Files or Software, (b) both the
+above copyright notice(s) and this permission notice appear in associated
+documentation, and (c) there is clear notice in each modified Data File or
+in the Software as well as in the documentation associated with the Data
+File(s) or Software that the data or software has been modified.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
+INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
+CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder shall
+not be used in advertising or otherwise to promote the sale, use or other
+dealings in these Data Files or Software without prior written
+authorization of the copyright holder.
+
+Unicode and the Unicode logo are trademarks of Unicode, Inc., and may be
+registered in some jurisdictions. All other trademarks and registered
+trademarks mentioned herein are the property of their respective owners.
+
+--------------------------------------------------------------------------------
diff --git a/cmake_modules/IcebergThirdpartyToolchain.cmake b/cmake_modules/IcebergThirdpartyToolchain.cmake
@@ -421,6 +421,62 @@ function(resolve_croaring_dependency)
       PARENT_SCOPE)
 endfunction()
 
+# ----------------------------------------------------------------------
+# utf8proc
+
+# Resolves utf8proc and makes the utf8proc::utf8proc target available.
+#
+# FetchContent is given both a download URL (overridable through the
+# ICEBERG_UTF8PROC_URL environment variable) and FIND_PACKAGE_ARGS, so the dependency
+# is satisfied either by an installed CMake package or by a source build.
+#
+# Sets in the parent scope:
+#   UTF8PROC_VENDORED           - TRUE when utf8proc was built from the fetched sources
+#   ICEBERG_SYSTEM_DEPENDENCIES - gains "utf8proc" when an installed package is used
+function(resolve_utf8proc_dependency)
+  prepare_fetchcontent()
+
+  if(DEFINED ENV{ICEBERG_UTF8PROC_URL})
+    set(UTF8PROC_URL "$ENV{ICEBERG_UTF8PROC_URL}")
+  else()
+    set(UTF8PROC_URL
+        "https://github.com/JuliaStrings/utf8proc/archive/refs/tags/v2.10.0.tar.gz")
+  endif()
+
+  # A vendored utf8proc is installed below as part of iceberg_targets, so switch off the
+  # install rules of utf8proc's own CMake project (UTF8PROC_INSTALL defaults to ON
+  # upstream); otherwise the library would be installed a second time under its
+  # original layout. A cache entry is used so the subproject's option() call honors it
+  # regardless of that project's policy settings. It has no effect when an installed
+  # package is used.
+  set(UTF8PROC_INSTALL
+      OFF
+      CACHE INTERNAL "")
+
+  fetchcontent_declare(utf8proc
+                       ${FC_DECLARE_COMMON_OPTIONS}
+                       URL ${UTF8PROC_URL}
+                           FIND_PACKAGE_ARGS
+                           NAMES
+                           utf8proc
+                           CONFIG)
+  fetchcontent_makeavailable(utf8proc)
+
+  # utf8proc_SOURCE_DIR is treated as the marker for a source (vendored) build; when the
+  # dependency came from find_package the else() branch below is taken instead.
+  if(utf8proc_SOURCE_DIR)
+    if(NOT TARGET utf8proc::utf8proc)
+      # Provide the namespaced name used by the rest of the build when the source build
+      # only defines the plain `utf8proc` target.
+      add_library(utf8proc::utf8proc INTERFACE IMPORTED)
+      target_link_libraries(utf8proc::utf8proc INTERFACE utf8proc)
+      target_include_directories(utf8proc::utf8proc INTERFACE ${utf8proc_SOURCE_DIR})
+    endif()
+
+    set(UTF8PROC_VENDORED TRUE)
+    # utf8proc's CMake puts a raw build-tree path in INTERFACE_INCLUDE_DIRECTORIES, which
+    # install(EXPORT) rejects. Wrap it in BUILD_INTERFACE so the export is valid; utf8proc
+    # is a private dependency, so installed consumers never need its headers.
+    set_target_properties(utf8proc
+                          PROPERTIES OUTPUT_NAME "iceberg_vendored_utf8proc"
+                                     POSITION_INDEPENDENT_CODE ON
+                                     INTERFACE_INCLUDE_DIRECTORIES
+                                     "$<BUILD_INTERFACE:${utf8proc_SOURCE_DIR}>")
+    install(TARGETS utf8proc
+            EXPORT iceberg_targets
+            RUNTIME DESTINATION "${ICEBERG_INSTALL_BINDIR}"
+            ARCHIVE DESTINATION "${ICEBERG_INSTALL_LIBDIR}"
+            LIBRARY DESTINATION "${ICEBERG_INSTALL_LIBDIR}")
+  else()
+    set(UTF8PROC_VENDORED FALSE)
+    list(APPEND ICEBERG_SYSTEM_DEPENDENCIES utf8proc)
+  endif()
+
+  set(ICEBERG_SYSTEM_DEPENDENCIES
+      ${ICEBERG_SYSTEM_DEPENDENCIES}
+      PARENT_SCOPE)
+  set(UTF8PROC_VENDORED
+      ${UTF8PROC_VENDORED}
+      PARENT_SCOPE)
+endfunction()
+
 # ----------------------------------------------------------------------
 # nlohmann-json
 
@@ -719,6 +775,7 @@ endfunction()
 resolve_zlib_dependency()
 resolve_nanoarrow_dependency()
 resolve_croaring_dependency()
+resolve_utf8proc_dependency()
 resolve_nlohmann_json_dependency()
 resolve_spdlog_dependency()
 

diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
@@ -145,23 +145,27 @@ list(APPEND
      "$<IF:$<BOOL:${NANOARROW_VENDORED}>,nanoarrow::nanoarrow_static,$<IF:$<TARGET_EXISTS:nanoarrow::nanoarrow_static>,nanoarrow::nanoarrow_static,nanoarrow::nanoarrow_shared>>"
      nlohmann_json::nlohmann_json
      spdlog::spdlog
+     utf8proc::utf8proc
      ZLIB::ZLIB)
 list(APPEND
      ICEBERG_SHARED_BUILD_INTERFACE_LIBS
      "$<IF:$<BOOL:${NANOARROW_VENDORED}>,nanoarrow::nanoarrow_static,$<IF:$<TARGET_EXISTS:nanoarrow::nanoarrow_shared>,nanoarrow::nanoarrow_shared,nanoarrow::nanoarrow_static>>"
      nlohmann_json::nlohmann_json
      spdlog::spdlog
+     utf8proc::utf8proc
      ZLIB::ZLIB)
 list(APPEND
      ICEBERG_STATIC_INSTALL_INTERFACE_LIBS
      "$<IF:$<BOOL:${NANOARROW_VENDORED}>,iceberg::nanoarrow_static,$<IF:$<TARGET_EXISTS:nanoarrow::nanoarrow_static>,nanoarrow::nanoarrow_static,nanoarrow::nanoarrow_shared>>"
      "$<IF:$<BOOL:${NLOHMANN_JSON_VENDORED}>,iceberg::nlohmann_json,$<IF:$<TARGET_EXISTS:nlohmann_json::nlohmann_json>,nlohmann_json::nlohmann_json,nlohmann_json::nlohmann_json>>"
-     "$<IF:$<BOOL:${SPDLOG_VENDORED}>,iceberg::spdlog,spdlog::spdlog>")
+     "$<IF:$<BOOL:${SPDLOG_VENDORED}>,iceberg::spdlog,spdlog::spdlog>"
+     "$<IF:$<BOOL:${UTF8PROC_VENDORED}>,iceberg::utf8proc,utf8proc::utf8proc>")
 list(APPEND
      ICEBERG_SHARED_INSTALL_INTERFACE_LIBS
      "$<IF:$<BOOL:${NANOARROW_VENDORED}>,iceberg::nanoarrow_static,$<IF:$<TARGET_EXISTS:nanoarrow::nanoarrow_shared>,nanoarrow::nanoarrow_shared,nanoarrow::nanoarrow_static>>"
      "$<IF:$<BOOL:${NLOHMANN_JSON_VENDORED}>,iceberg::nlohmann_json,$<IF:$<TARGET_EXISTS:nlohmann_json::nlohmann_json>,nlohmann_json::nlohmann_json,nlohmann_json::nlohmann_json>>"
-     "$<IF:$<BOOL:${SPDLOG_VENDORED}>,iceberg::spdlog,spdlog::spdlog>")
+     "$<IF:$<BOOL:${SPDLOG_VENDORED}>,iceberg::spdlog,spdlog::spdlog>"
+     "$<IF:$<BOOL:${UTF8PROC_VENDORED}>,iceberg::utf8proc,utf8proc::utf8proc>")
 
 add_iceberg_lib(iceberg
                 SOURCES

diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build
@@ -190,8 +190,15 @@ nanoarrow_dep = dependency('nanoarrow')
 nlohmann_json_dep = dependency('nlohmann_json')
 spdlog_dep = dependency('spdlog')
 zlib_dep = dependency('zlib')
+utf8proc_dep = dependency('libutf8proc')
 
-iceberg_deps = [nanoarrow_dep, nlohmann_json_dep, spdlog_dep, zlib_dep]
+iceberg_deps = [
+    nanoarrow_dep,
+    nlohmann_json_dep,
+    spdlog_dep,
+    zlib_dep,
+    utf8proc_dep,
+]
 
 iceberg_lib = library(
     'iceberg',

diff --git a/src/iceberg/test/string_util_test.cc b/src/iceberg/test/string_util_test.cc
@@ -41,19 +41,30 @@ TEST(StringUtilsTest, ToUpper) {
   ASSERT_EQ(StringUtils::ToUpper("123"), "123");
 }
 
-// Non-ASCII (multibyte UTF-8) bytes have the high bit set, i.e. are negative when stored
-// in a signed char. Only ASCII letters are converted; multibyte bytes pass through
-// unchanged. The non-ASCII strings are written as explicit UTF-8 byte escapes so the test
-// does not depend on the source-file encoding. See
-// https://github.com/apache/iceberg-cpp/issues/613.
-TEST(StringUtilsTest, NonAsciiPassThrough) {
-  // "Naïve" -> "naïve" (ï = U+00EF = 0xC3 0xAF; only the ASCII letters change).
-  ASSERT_EQ(StringUtils::ToLower("Na\xC3\xAFve"), "na\xC3\xAFve");
-  // "café" -> "CAFé" (é = U+00E9 = 0xC3 0xA9 stays unchanged).
-  ASSERT_EQ(StringUtils::ToUpper("caf\xC3\xA9"), "CAF\xC3\xA9");
-  // "日本語" (0xE6 0x97 0xA5 0xE6 0x9C 0xAC 0xE8 0xAA 0x9E) is returned verbatim.
+// Non-ASCII strings are written as explicit UTF-8 byte escapes so the test does not
+// depend on the source-file encoding. An escape is split before a following hex digit
+// (e.g. "...\x9E" "E") so the \x does not absorb it.
+// See https://github.com/apache/iceberg-cpp/issues/613.
+TEST(StringUtilsTest, ToLowerUnicode) {
+  // "CAFÉ" -> "café" (É U+00C9 = 0xC3 0x89 -> é U+00E9 = 0xC3 0xA9).
+  ASSERT_EQ(StringUtils::ToLower("CAF\xC3\x89"), "caf\xC3\xA9");
+  // "GROẞE" -> "große": capital sharp S (ẞ U+1E9E) lower-cases to ß (U+00DF), not "ss"
+  // as casefolding would produce. The result is one byte shorter than the input.
+  ASSERT_EQ(StringUtils::ToLower("GRO\xE1\xBA\x9E"
+                                 "E"),
+            "gro\xC3\x9F"
+            "e");
+  // The result can also be longer than the input: Ⱥ (U+023A = 0xC8 0xBA, two bytes)
+  // lower-cases to ⱥ (U+2C65 = 0xE2 0xB1 0xA5, three bytes).
+  ASSERT_EQ(StringUtils::ToLower("\xC8\xBA"), "\xE2\xB1\xA5");
+  // "日本語" has no case mapping and is returned verbatim.
+  ASSERT_EQ(StringUtils::ToLower("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"),
+            "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E");
+  // Invalid UTF-8 (a lone 0xFF byte) is returned unchanged rather than erroring.
+  ASSERT_EQ(StringUtils::ToLower("\xFF"), "\xFF");
+  // An invalid byte after otherwise valid text leaves the whole input unchanged, so the
+  // ASCII letters before it are not lowered either.
+  ASSERT_EQ(StringUtils::ToLower("ABC\xFF"), "ABC\xFF");
+}
+
+// ToUpper is intentionally ASCII-only; non-ASCII (multibyte UTF-8) bytes pass through.
+TEST(StringUtilsTest, ToUpperAsciiOnly) {
+  // "café" -> "CAFé" (é stays unchanged).
+  ASSERT_EQ(StringUtils::ToUpper("caf\xC3\xA9"), "CAF\xC3\xA9");
+  // "日本語" contains no ASCII letters and is returned verbatim.
   ASSERT_EQ(StringUtils::ToUpper("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"),
             "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E");
 }
@@ -63,9 +74,15 @@ TEST(StringUtilsTest, EqualsIgnoreCase) {
   ASSERT_TRUE(StringUtils::EqualsIgnoreCase("", ""));
   ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abcd"));
   ASSERT_FALSE(StringUtils::EqualsIgnoreCase("abc", "abd"));
-  // ASCII case is folded; non-ASCII bytes are compared as-is. ("Café" vs "café")
-  ASSERT_TRUE(StringUtils::EqualsIgnoreCase("Caf\xC3\xA9", "caf\xC3\xA9"));
-  // "café" vs "cafe": the multibyte é differs from ASCII 'e'.
+  // Unicode-aware: "CAFÉ" matches "café".
+  ASSERT_TRUE(StringUtils::EqualsIgnoreCase("CAF\xC3\x89", "caf\xC3\xA9"));
+  // "GROẞE" matches "große" under lowercasing (ẞ -> ß).
+  ASSERT_TRUE(
+      StringUtils::EqualsIgnoreCase("GRO\xE1\xBA\x9E"
+                                    "E",
+                                    "gro\xC3\x9F"
+                                    "e"));
+  // Different letters still differ ("café" vs "cafe").
   ASSERT_FALSE(StringUtils::EqualsIgnoreCase("caf\xC3\xA9", "cafe"));
 }
 

diff --git a/src/iceberg/util/string_util.cc b/src/iceberg/util/string_util.cc
@@ -19,10 +19,41 @@
 
 #include "iceberg/util/string_util.h"
 
+#include <utf8proc.h>
+
+#include <array>
+
 #include "iceberg/util/macros.h"
 
 namespace iceberg {
 
+// Lower-cases a UTF-8 string code point by code point using utf8proc's simple (1:1)
+// lower-case mapping. If any byte sequence fails to decode, the whole input is
+// returned unchanged (the already-converted prefix is discarded), so callers never
+// see a partially lowered string.
+std::string StringUtils::ToLower(std::string_view str) {
+  std::string result;
+  // A lowered code point may encode to a different number of bytes than the original,
+  // so this is a capacity hint rather than an exact size.
+  result.reserve(str.size());
+
+  const auto* data = reinterpret_cast<const utf8proc_uint8_t*>(str.data());
+  const auto size = static_cast<utf8proc_ssize_t>(str.size());
+  utf8proc_ssize_t offset = 0;
+  while (offset < size) {
+    const utf8proc_uint8_t lead = data[offset];
+    if (lead < 0x80) {
+      // ASCII fast path: a byte below 0x80 is a complete code point, and the only
+      // lower-case mappings in that range are 'A'..'Z' -> 'a'..'z'. Handling it here
+      // gives the same result as the utf8proc path below without the decode / table
+      // lookup / re-encode round trip.
+      const bool is_upper = lead >= 'A' && lead <= 'Z';
+      result.push_back(static_cast<char>(is_upper ? lead - 'A' + 'a' : lead));
+      ++offset;
+      continue;
+    }
+
+    utf8proc_int32_t code_point = 0;
+    const utf8proc_ssize_t consumed =
+        utf8proc_iterate(data + offset, size - offset, &code_point);
+    if (consumed <= 0) {
+      // Invalid UTF-8 (negative error code): return the input unchanged rather than
+      // erroring. Treating 0 the same way guarantees the loop always makes progress.
+      return std::string(str);
+    }
+    // utf8proc has no string-level lower-case helper, so map and re-encode each code
+    // point individually. utf8proc_tolower is a simple 1:1 mapping (not casefolding).
+    const utf8proc_int32_t lowered = utf8proc_tolower(code_point);
+    std::array<utf8proc_uint8_t, 4> encoded{};
+    const utf8proc_ssize_t written = utf8proc_encode_char(lowered, encoded.data());
+    result.append(reinterpret_cast<const char*>(encoded.data()),
+                  static_cast<size_t>(written));
+    offset += consumed;
+  }
+  return result;
+}
+
 Result<std::vector<uint8_t>> StringUtils::HexStringToBytes(std::string_view hex) {
   if (hex.size() % 2 != 0) [[unlikely]] {
     return InvalidArgument("Hex string must have even length, got: {}", hex.size());

diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h
@@ -20,7 +20,6 @@
 #pragma once
 
 #include <algorithm>
-#include <cctype>
 #include <cerrno>
 #include <charconv>
 #include <ranges>
@@ -41,22 +40,36 @@ concept FromChars = requires(const char* p, T& v) { std::from_chars(p, p, v); };
 
 class ICEBERG_EXPORT StringUtils {
  public:
-  // NOTE: These convert ASCII letters only; all other bytes, including non-ASCII
-  // (multibyte UTF-8) bytes, are passed through unchanged.
-  // See https://github.com/apache/iceberg-cpp/issues/613.
-  static std::string ToLower(std::string_view str) {
-    return str | std::ranges::views::transform(ToLowerAscii) |
-           std::ranges::to<std::string>();
-  }
-
+  /// \brief Lower-case a UTF-8 string using Unicode simple (1:1) case mapping.
+  ///
+  /// Intended for case-insensitive name matching, similar to Iceberg Java's
+  /// toLowerCase(Locale.ROOT). The mapping is locale-independent, matching the intent
+  /// of Locale.ROOT. It uses simple (1:1) case mapping rather than Java's full case
+  /// mapping, so results differ for a few code points; e.g. U+0130 (capital I with dot
+  /// above) maps to U+0069 ("i") here, but to U+0069 U+0307 ("i" + combining dot above)
+  /// in Java. For ASCII and the large majority of letters the two agree.
+  ///
+  /// The mapping is per code point, not per byte, so the result may have a different
+  /// byte length than the input (e.g. U+1E9E, three bytes, becomes U+00DF, two bytes).
+  ///
+  /// If any part of the input is not valid UTF-8, the whole input is returned
+  /// unchanged: no letters, ASCII ones included, are lowered.
+  /// See https://github.com/apache/iceberg-cpp/issues/613.
+  ///
+  /// \param str The string to convert; expected to be UTF-8.
+  /// \return The lower-cased copy, or a copy of `str` as-is when it is not valid UTF-8.
+  static std::string ToLower(std::string_view str);
+
+  /// \brief Upper-case the ASCII letters (a-z) in a string; all other bytes, including
+  /// multi-byte UTF-8 sequences, are left unchanged.
+  ///
+  /// Deliberately ASCII-only and, unlike ToLower, not Unicode-aware. It is only used to
+  /// normalize ASCII enum/codec strings (e.g. "gzip" -> "GZIP", "all" -> "ALL") for
+  /// case-insensitive comparison. A Unicode upper-case is intentionally not provided:
+  /// simple case mapping would be wrong for some letters (e.g. "ß" (U+00DF) would stay
+  /// unchanged instead of becoming "SS"), and no caller needs it.
+  ///
+  /// \param str The string to convert; it is processed byte by byte and is not
+  /// validated as UTF-8.
+  /// \return A copy of `str` with the same length in which a-z are replaced by A-Z.
   static std::string ToUpper(std::string_view str) {
     return str | std::ranges::views::transform(ToUpperAscii) |
            std::ranges::to<std::string>();
   }
 
+  /// \brief Case-insensitive equality; compares the ToLower forms of both operands and
+  /// therefore inherits ToLower's Unicode simple-mapping behavior.
+  ///
+  /// Operands of different byte length can compare equal, because a lowered code point
+  /// may encode to a different number of bytes than the original. ToLower returns an
+  /// operand that is not valid UTF-8 unchanged, so such an operand is compared with its
+  /// original bytes and its ASCII letters are not folded. Each call builds two temporary
+  /// strings.
+  ///
+  /// \return true if the lower-cased forms of `lhs` and `rhs` are byte-for-byte equal.
   static bool EqualsIgnoreCase(std::string_view lhs, std::string_view rhs) {
-    return std::ranges::equal(
-        lhs, rhs, [](char lc, char rc) { return ToLowerAscii(lc) == ToLowerAscii(rc); });
+    return ToLower(lhs) == ToLower(rhs);
   }
 
   static bool StartsWithIgnoreCase(std::string_view str, std::string_view prefix) {
@@ -134,14 +147,8 @@ class ICEBERG_EXPORT StringUtils {
   }
 
  private:
-  // ASCII-only case conversion using explicit range checks rather than
-  // std::tolower/std::toupper. This is independent of the current C locale and never
-  // touches non-ASCII (high-bit) bytes, so multibyte UTF-8 sequences are preserved. It
-  // also sidesteps the undefined behavior of passing a negative char to <cctype>.
-  static constexpr char ToLowerAscii(char c) noexcept {
-    return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
-  }
-
+  // Maps 'a'..'z' to 'A'..'Z' and returns every other char value unchanged. Uses an
+  // explicit range check instead of std::toupper, which is locale-dependent and has
+  // undefined behavior for negative char values.
   static constexpr char ToUpperAscii(char c) noexcept {
     return (c >= 'a' && c <= 'z') ? static_cast<char>(c - 'a' + 'A') : c;
   }