Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 85 additions & 60 deletions cpp/src/gandiva/precompiled/string_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1988,48 +1988,60 @@ const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32
// fill into text but "fill_text" is empty, then return text directly.
*out_len = text_len;
return text;
} else if (return_length < actual_text_len) {
}
if (return_length < actual_text_len) {
// case where the result is truncated to the first return_length characters.
*out_len = utf8_byte_pos(context, text, text_len, return_length);
return text;
} else {
// case (return_length > actual_text_len)
// case where it needs to copy "fill_text" on the string left. The total number
// of chars to copy is given by (return_length - actual_text_len)
gdv_int32 return_char_length = evaluate_return_char_length(
text_len, actual_text_len, return_length, fill_text, fill_text_len);
char* ret = reinterpret_cast<gdv_binary>(
gdv_fn_context_arena_malloc(context, return_char_length));
}

gdv_int32 chars_to_pad = return_length - actual_text_len;

// FAST PATH: Single-byte fill (most common - space padding)
if (fill_text_len == 1) {
gdv_int32 out_len_bytes = chars_to_pad + text_len;
char* ret =
reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, out_len_bytes));
if (ret == nullptr) {
gdv_fn_context_set_error_msg(context,
"Could not allocate memory for output string");
*out_len = 0;
return "";
}
// repeatedly copy "fill_text" into the output until the padding is complete
int32_t copied_chars_count = 0;
int32_t copied_chars_position = 0;
while (copied_chars_count < return_length - actual_text_len) {
int32_t char_len;
int32_t fill_index;
// for each char, evaluate its length to consider it when mem copying
for (fill_index = 0; fill_index < fill_text_len; fill_index += char_len) {
if (copied_chars_count >= return_length - actual_text_len) {
break;
}
char_len = utf8_char_length(fill_text[fill_index]);
// ignore invalid char on the fill text, considering it as size 1
if (char_len == 0) char_len += 1;
copied_chars_count++;
}
memcpy(ret + copied_chars_position, fill_text, fill_index);
copied_chars_position += fill_index;
}
// after writing the padding, copy the main string
memcpy(ret + copied_chars_position, text, text_len);
*out_len = copied_chars_position + text_len;
memset(ret, fill_text[0], chars_to_pad);
memcpy(ret + chars_to_pad, text, text_len);
*out_len = out_len_bytes;
return ret;
}

// GENERAL PATH: Multi-byte fill - use evaluate_return_char_length for buffer size
gdv_int32 return_char_length = evaluate_return_char_length(
text_len, actual_text_len, return_length, fill_text, fill_text_len);
char* ret = reinterpret_cast<gdv_binary>(
gdv_fn_context_arena_malloc(context, return_char_length));
if (ret == nullptr) {
gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
*out_len = 0;
return "";
}

// Fill using doubling strategy (O(log n) memcpy calls)
gdv_int32 total_fill_bytes = return_char_length - text_len;
// Copy only as much of fill_text as we need (may be less than fill_text_len)
gdv_int32 initial_copy = std::min(fill_text_len, total_fill_bytes);
memcpy(ret, fill_text, initial_copy);
gdv_int32 written = initial_copy;
while (written * 2 <= total_fill_bytes) {
memcpy(ret + written, ret, written);
written *= 2;
}
if (written < total_fill_bytes) {
memcpy(ret + written, ret, total_fill_bytes - written);
}

memcpy(ret + total_fill_bytes, text, text_len);
*out_len = return_char_length;
return ret;
}

FORCE_INLINE
Expand All @@ -2054,47 +2066,60 @@ const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32
// fill into text but "fill_text" is empty, then return text directly.
*out_len = text_len;
return text;
} else if (return_length < actual_text_len) {
}
if (return_length < actual_text_len) {
// case where the result is truncated to the first return_length characters.
*out_len = utf8_byte_pos(context, text, text_len, return_length);
return text;
} else {
// case (return_length > actual_text_len)
// case where it needs to copy "fill_text" on the string right
gdv_int32 return_char_length = evaluate_return_char_length(
text_len, actual_text_len, return_length, fill_text, fill_text_len);
char* ret = reinterpret_cast<gdv_binary>(
gdv_fn_context_arena_malloc(context, return_char_length));
}

gdv_int32 chars_to_pad = return_length - actual_text_len;

// FAST PATH: Single-byte fill (most common - space padding)
if (fill_text_len == 1) {
gdv_int32 out_len_bytes = chars_to_pad + text_len;
char* ret =
reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, out_len_bytes));
if (ret == nullptr) {
gdv_fn_context_set_error_msg(context,
"Could not allocate memory for output string");
*out_len = 0;
return "";
}
// start the output by copying the main input string
memcpy(ret, text, text_len);
// repeatedly copy "fill_text" into the output until the padding is complete
int32_t copied_chars_count = 0;
int32_t copied_chars_position = 0;
while (actual_text_len + copied_chars_count < return_length) {
int32_t char_len;
int32_t fill_length;
// for each char, evaluate its length to consider it when mem copying
for (fill_length = 0; fill_length < fill_text_len; fill_length += char_len) {
if (actual_text_len + copied_chars_count >= return_length) {
break;
}
char_len = utf8_char_length(fill_text[fill_length]);
// ignore invalid char on the fill text, considering it as size 1
if (char_len == 0) char_len += 1;
copied_chars_count++;
}
memcpy(ret + text_len + copied_chars_position, fill_text, fill_length);
copied_chars_position += fill_length;
}
*out_len = copied_chars_position + text_len;
memset(ret + text_len, fill_text[0], chars_to_pad);
*out_len = out_len_bytes;
return ret;
}

// GENERAL PATH: Multi-byte fill - use evaluate_return_char_length for buffer size
gdv_int32 return_char_length = evaluate_return_char_length(
text_len, actual_text_len, return_length, fill_text, fill_text_len);
char* ret = reinterpret_cast<gdv_binary>(
gdv_fn_context_arena_malloc(context, return_char_length));
if (ret == nullptr) {
gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
*out_len = 0;
return "";
}

// Copy text first, then fill using doubling strategy
memcpy(ret, text, text_len);
gdv_int32 total_fill_bytes = return_char_length - text_len;
// Copy only as much of fill_text as we need (may be less than fill_text_len)
gdv_int32 initial_copy = std::min(fill_text_len, total_fill_bytes);
memcpy(ret + text_len, fill_text, initial_copy);
gdv_int32 written = initial_copy;
while (written * 2 <= total_fill_bytes) {
memcpy(ret + text_len + written, ret + text_len, written);
written *= 2;
}
if (written < total_fill_bytes) {
memcpy(ret + text_len + written, ret + text_len, total_fill_bytes - written);
}

*out_len = return_char_length;
return ret;
}

FORCE_INLINE
Expand Down
190 changes: 190 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1318,6 +1318,101 @@ TEST(TestStringOps, TestLpadString) {

out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, -1, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "");

out_str = lpad_utf8_int32_utf8(ctx_ptr, "x", 1, 65536, "😀", 4, &out_len);
EXPECT_EQ(out_len, 65535 * 4 + 1);
EXPECT_FALSE(ctx.has_error());
EXPECT_EQ(out_str[out_len - 1], 'x');
EXPECT_EQ(static_cast<unsigned char>(out_str[0]), 0xF0);
EXPECT_EQ(static_cast<unsigned char>(out_str[1]), 0x9F);
EXPECT_EQ(static_cast<unsigned char>(out_str[2]), 0x98);
EXPECT_EQ(static_cast<unsigned char>(out_str[3]), 0x80);

out_str = lpad_utf8_int32_utf8(ctx_ptr, "A", 1, 65536, "哈", 3, &out_len);
EXPECT_EQ(out_len, 65535 * 3 + 1);
EXPECT_FALSE(ctx.has_error());
EXPECT_EQ(out_str[out_len - 1], 'A');

out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 2, ".", 1, &out_len);
EXPECT_EQ(std::string(out_str, out_len), ".X");

out_str = lpad_utf8_int32_utf8(ctx_ptr, "Z", 1, 65536, "@", 1, &out_len);
EXPECT_EQ(out_len, 65536);
for (int i = 0; i < 100; i++) {
EXPECT_EQ(out_str[i], '@') << "Mismatch at position " << i;
}
EXPECT_EQ(out_str[out_len - 1], 'Z');

out_str = lpad_utf8_int32_utf8(ctx_ptr, "END", 3, 11, "ab", 2, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "ababababEND");

out_str = lpad_utf8_int32_utf8(ctx_ptr, "END", 3, 10, "abc", 3, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "abcabcaEND");

out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 5, "αβ", 4, &out_len);
EXPECT_EQ(out_len, 9);
EXPECT_EQ(std::string(out_str, out_len), "αβαβX");

out_str = lpad_utf8_int32_utf8(ctx_ptr, "Y", 1, 4, "中文", 6, &out_len);
EXPECT_EQ(out_len, 10);
EXPECT_EQ(std::string(out_str, out_len), "中文中Y");

out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 4, "abc", 3, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "abcX");

out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 7, "abc", 3, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "abcabcX");

out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 13, "abc", 3, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "abcabcabcabcX");

out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 10, "abc", 3, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "abcabcabcX");

out_str = lpad_utf8_int32_utf8(ctx_ptr, "E", 1, 129, "ab", 2, &out_len);
EXPECT_EQ(out_len, 129);
EXPECT_EQ(out_str[0], 'a');
EXPECT_EQ(out_str[1], 'b');
EXPECT_EQ(out_str[126], 'a');
EXPECT_EQ(out_str[127], 'b');
EXPECT_EQ(out_str[128], 'E');

out_str = lpad_utf8_int32_utf8(ctx_ptr, "E", 1, 127, "ab", 2, &out_len);
EXPECT_EQ(out_len, 127);
EXPECT_EQ(out_str[0], 'a');
EXPECT_EQ(out_str[125], 'b');
EXPECT_EQ(out_str[126], 'E');

out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 2, "abc", 3, &out_len);
EXPECT_EQ(out_len, 2);
EXPECT_EQ(std::string(out_str, out_len), "aX");

out_str = lpad_utf8_int32_utf8(ctx_ptr, "Y", 1, 3, "abcde", 5, &out_len);
EXPECT_EQ(out_len, 3);
EXPECT_EQ(std::string(out_str, out_len), "abY");

out_str = lpad_utf8_int32_utf8(ctx_ptr, "Z", 1, 2, "αβ", 4, &out_len);
EXPECT_EQ(out_len, 3);
EXPECT_EQ(std::string(out_str, out_len), "αZ");

out_str = lpad_utf8_int32_utf8(ctx_ptr, "A", 1, 2, "中文字", 9, &out_len);
EXPECT_EQ(out_len, 4);
EXPECT_EQ(std::string(out_str, out_len), "中A");

out_str = lpad_utf8_int32_utf8(ctx_ptr, "B", 1, 3, "中文字", 9, &out_len);
EXPECT_EQ(out_len, 7);
EXPECT_EQ(std::string(out_str, out_len), "中文B");

std::string large_text(5000, 'X');
std::string large_fill;
for (int i = 0; i < 50; ++i) {
large_fill += "α";
}
out_str = lpad_utf8_int32_utf8(ctx_ptr, large_text.c_str(), 5000, 5001,
large_fill.c_str(), 100, &out_len);
EXPECT_EQ(out_len, 5002);
EXPECT_EQ(std::string(out_str, 2), "α");
EXPECT_EQ(std::string(out_str + 2, 5000), large_text);
}

TEST(TestStringOps, TestRpadString) {
Expand Down Expand Up @@ -1396,6 +1491,101 @@ TEST(TestStringOps, TestRpadString) {

out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, -1, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "");

out_str = rpad_utf8_int32_utf8(ctx_ptr, "x", 1, 65536, "😀", 4, &out_len);
EXPECT_EQ(out_len, 1 + 65535 * 4);
EXPECT_FALSE(ctx.has_error());
EXPECT_EQ(out_str[0], 'x');
EXPECT_EQ(static_cast<unsigned char>(out_str[out_len - 4]), 0xF0);
EXPECT_EQ(static_cast<unsigned char>(out_str[out_len - 3]), 0x9F);
EXPECT_EQ(static_cast<unsigned char>(out_str[out_len - 2]), 0x98);
EXPECT_EQ(static_cast<unsigned char>(out_str[out_len - 1]), 0x80);

out_str = rpad_utf8_int32_utf8(ctx_ptr, "A", 1, 65536, "哈", 3, &out_len);
EXPECT_EQ(out_len, 1 + 65535 * 3);
EXPECT_FALSE(ctx.has_error());
EXPECT_EQ(out_str[0], 'A');

out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 2, ".", 1, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "X.");

out_str = rpad_utf8_int32_utf8(ctx_ptr, "Z", 1, 65536, "@", 1, &out_len);
EXPECT_EQ(out_len, 65536);
EXPECT_EQ(out_str[0], 'Z');
for (int i = 1; i < 100; i++) {
EXPECT_EQ(out_str[i], '@') << "Mismatch at position " << i;
}

out_str = rpad_utf8_int32_utf8(ctx_ptr, "BEG", 3, 11, "ab", 2, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "BEGabababab");

out_str = rpad_utf8_int32_utf8(ctx_ptr, "BEG", 3, 10, "abc", 3, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "BEGabcabca");

out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 5, "αβ", 4, &out_len);
EXPECT_EQ(out_len, 9);
EXPECT_EQ(std::string(out_str, out_len), "Xαβαβ");

out_str = rpad_utf8_int32_utf8(ctx_ptr, "Y", 1, 4, "中文", 6, &out_len);
EXPECT_EQ(out_len, 10);
EXPECT_EQ(std::string(out_str, out_len), "Y中文中");

out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 4, "abc", 3, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "Xabc");

out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 7, "abc", 3, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "Xabcabc");

out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 13, "abc", 3, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "Xabcabcabcabc");

out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 10, "abc", 3, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "Xabcabcabc");

out_str = rpad_utf8_int32_utf8(ctx_ptr, "S", 1, 129, "ab", 2, &out_len);
EXPECT_EQ(out_len, 129);
EXPECT_EQ(out_str[0], 'S');
EXPECT_EQ(out_str[1], 'a');
EXPECT_EQ(out_str[2], 'b');
EXPECT_EQ(out_str[127], 'a');
EXPECT_EQ(out_str[128], 'b');

out_str = rpad_utf8_int32_utf8(ctx_ptr, "S", 1, 127, "ab", 2, &out_len);
EXPECT_EQ(out_len, 127);
EXPECT_EQ(out_str[0], 'S');
EXPECT_EQ(out_str[125], 'a');
EXPECT_EQ(out_str[126], 'b');

out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 2, "abc", 3, &out_len);
EXPECT_EQ(out_len, 2);
EXPECT_EQ(std::string(out_str, out_len), "Xa");

out_str = rpad_utf8_int32_utf8(ctx_ptr, "Y", 1, 3, "abcde", 5, &out_len);
EXPECT_EQ(out_len, 3);
EXPECT_EQ(std::string(out_str, out_len), "Yab");

out_str = rpad_utf8_int32_utf8(ctx_ptr, "Z", 1, 2, "αβ", 4, &out_len);
EXPECT_EQ(out_len, 3);
EXPECT_EQ(std::string(out_str, out_len), "Zα");

out_str = rpad_utf8_int32_utf8(ctx_ptr, "A", 1, 2, "中文字", 9, &out_len);
EXPECT_EQ(out_len, 4);
EXPECT_EQ(std::string(out_str, out_len), "A中");

out_str = rpad_utf8_int32_utf8(ctx_ptr, "B", 1, 3, "中文字", 9, &out_len);
EXPECT_EQ(out_len, 7);
EXPECT_EQ(std::string(out_str, out_len), "B中文");

std::string large_text(5000, 'X');
std::string large_fill;
for (int i = 0; i < 50; ++i) {
large_fill += "α";
}
out_str = rpad_utf8_int32_utf8(ctx_ptr, large_text.c_str(), 5000, 5001,
large_fill.c_str(), 100, &out_len);
EXPECT_EQ(out_len, 5002);
EXPECT_EQ(std::string(out_str, 5000), large_text);
EXPECT_EQ(std::string(out_str + 5000, 2), "α");
}

TEST(TestStringOps, TestRtrim) {
Expand Down
Loading