Skip to content

Commit

Permalink
Fix the nvarchar-varbinary casting (#3072)
Browse files Browse the repository at this point in the history
PROBLEM: while casting nvarchar to varbinary, Babelfish was treating the input encoding as UTF8,
whereas T-SQL uses UTF16 encoding for nvarchar irrespective of the input encoding.

RCA: we were treating varchar and nvarchar as the same, whereas we should use the input encoding for varchar and UTF16
encoding for nvarchar.

FIX: We need to detect when the input is nvarchar and, in that case, encode it as UTF16.

For a round trip such as nvarchar->varbinary->nvarchar, the function nvarcharvarbinary now encodes the input string
as UTF16, so the reverse cast (varbinary->nvarchar) uses the function varbinarynvarchar, which converts the UTF16
bytes back to UTF8 with null padding.

So we created the functions nvarcharvarbinary and varbinarynvarchar to handle casting between nvarchar and varbinary in both directions.
For this casting we have specifically added a condition so that the datatype is not converted to its base type before the casting function is chosen.

Task: BABEL-4891
Signed-off-by: Pranav Jain <[email protected]>
  • Loading branch information
pranavJ23 committed Jan 6, 2025
1 parent a385957 commit b466789
Show file tree
Hide file tree
Showing 61 changed files with 3,989 additions and 471 deletions.
10 changes: 10 additions & 0 deletions contrib/babelfishpg_common/sql/binary.sql
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@ LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
-- Register sys.varcharbinary(sys.VARCHAR, integer, boolean) as the cast
-- from sys.VARCHAR to sys.BBF_BINARY, usable in assignment context.
CREATE CAST (sys.VARCHAR AS sys.BBF_BINARY)
WITH FUNCTION sys.varcharbinary (sys.VARCHAR, integer, boolean) AS ASSIGNMENT;

-- sys.nvarcharbinary: converts sys.NVARCHAR to sys.BBF_BINARY.
-- Bound to the C symbol 'nvarcharbinary' in the babelfishpg_common library.
-- The C implementation reads the integer argument as the target typmod and
-- the boolean argument as the "cast is explicit" flag.
CREATE OR REPLACE FUNCTION sys.nvarcharbinary(sys.NVARCHAR, integer, boolean)
RETURNS sys.BBF_BINARY
AS 'babelfishpg_common', 'nvarcharbinary'
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE OR REPLACE FUNCTION sys.varcharbinary(pg_catalog.VARCHAR, integer, boolean)
RETURNS sys.BBF_BINARY
AS 'babelfishpg_common', 'varcharbinary'
Expand Down Expand Up @@ -99,6 +104,11 @@ LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
-- Register sys.binarysysvarchar(sys.BBF_BINARY, integer, boolean) as the
-- implicit cast from sys.BBF_BINARY to sys.VARCHAR.
CREATE CAST (sys.BBF_BINARY AS sys.VARCHAR)
WITH FUNCTION sys.binarysysvarchar (sys.BBF_BINARY, integer, boolean) AS IMPLICIT;

-- sys.binarysysnvarchar: converts sys.BBF_BINARY to sys.NVARCHAR.
-- Bound to the C symbol 'varbinarynvarchar' in the babelfishpg_common
-- library. The C implementation reads the integer argument as the target
-- typmod; it does not read the boolean argument.
CREATE OR REPLACE FUNCTION sys.binarysysnvarchar(sys.BBF_BINARY, integer, boolean)
RETURNS sys.NVARCHAR
AS 'babelfishpg_common', 'varbinarynvarchar'
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE OR REPLACE FUNCTION sys.binaryvarchar(sys.BBF_BINARY, integer, boolean)
RETURNS pg_catalog.VARCHAR
AS 'babelfishpg_common', 'varbinaryvarchar'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,26 @@

-- Put the sys schema at the front of the current search_path.
-- The third argument (is_local) is false, so the setting is not limited to
-- the current transaction.
SELECT set_config('search_path', 'sys, '||current_setting('search_path'), false);

-- sys.nvarcharvarbinary: converts sys.NVARCHAR to sys.BBF_VARBINARY.
-- Bound to the C symbol 'nvarcharvarbinary' in the babelfishpg_common
-- library. The C implementation reads the integer argument as the target
-- typmod and the boolean argument as the "cast is explicit" flag.
CREATE OR REPLACE FUNCTION sys.nvarcharvarbinary(sys.NVARCHAR, integer, boolean)
RETURNS sys.BBF_VARBINARY
AS 'babelfishpg_common', 'nvarcharvarbinary'
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

-- sys.varbinarysysnvarchar: converts sys.BBF_VARBINARY to sys.NVARCHAR.
-- Bound to the C symbol 'varbinarynvarchar' in the babelfishpg_common
-- library. The C implementation reads the integer argument as the target
-- typmod; it does not read the boolean argument.
CREATE OR REPLACE FUNCTION sys.varbinarysysnvarchar(sys.BBF_VARBINARY, integer, boolean)
RETURNS sys.NVARCHAR
AS 'babelfishpg_common', 'varbinarynvarchar'
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

-- sys.binarysysnvarchar: converts sys.BBF_BINARY to sys.NVARCHAR.
-- Bound to the same C symbol ('varbinarynvarchar' in babelfishpg_common)
-- as the sys.BBF_VARBINARY variant; only the SQL-level input type differs.
CREATE OR REPLACE FUNCTION sys.binarysysnvarchar(sys.BBF_BINARY, integer, boolean)
RETURNS sys.NVARCHAR
AS 'babelfishpg_common', 'varbinarynvarchar'
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

-- sys.nvarcharbinary: converts sys.NVARCHAR to sys.BBF_BINARY.
-- Bound to the C symbol 'nvarcharbinary' in the babelfishpg_common library.
-- The C implementation reads the integer argument as the target typmod and
-- the boolean argument as the "cast is explicit" flag.
CREATE OR REPLACE FUNCTION sys.nvarcharbinary(sys.NVARCHAR, integer, boolean)
RETURNS sys.BBF_BINARY
AS 'babelfishpg_common', 'nvarcharbinary'
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE OR REPLACE FUNCTION sys.smalldatetime_date_cmp(sys.SMALLDATETIME, date)
RETURNS INT4
AS 'timestamp_cmp_date'
Expand Down
10 changes: 10 additions & 0 deletions contrib/babelfishpg_common/sql/varbinary.sql
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,11 @@ LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
-- Register sys.varbinarybytea(sys.BBF_VARBINARY, integer, boolean) as the
-- cast from sys.BBF_VARBINARY to pg_catalog.BYTEA, usable in assignment
-- context.
CREATE CAST (sys.BBF_VARBINARY AS pg_catalog.BYTEA)
WITH FUNCTION sys.varbinarybytea(sys.BBF_VARBINARY, integer, boolean) AS ASSIGNMENT;

-- sys.nvarcharvarbinary: converts sys.NVARCHAR to sys.BBF_VARBINARY.
-- Bound to the C symbol 'nvarcharvarbinary' in the babelfishpg_common
-- library. The C implementation reads the integer argument as the target
-- typmod and the boolean argument as the "cast is explicit" flag.
CREATE OR REPLACE FUNCTION sys.nvarcharvarbinary(sys.NVARCHAR, integer, boolean)
RETURNS sys.BBF_VARBINARY
AS 'babelfishpg_common', 'nvarcharvarbinary'
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE OR REPLACE FUNCTION sys.varcharvarbinary(sys.VARCHAR, integer, boolean)
RETURNS sys.BBF_VARBINARY
AS 'babelfishpg_common', 'varcharvarbinary'
Expand Down Expand Up @@ -111,6 +116,11 @@ LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
-- Register sys.varbinarysysvarchar(sys.BBF_VARBINARY, integer, boolean) as
-- the implicit cast from sys.BBF_VARBINARY to sys.VARCHAR.
CREATE CAST (sys.BBF_VARBINARY AS sys.VARCHAR)
WITH FUNCTION sys.varbinarysysvarchar (sys.BBF_VARBINARY, integer, boolean) AS IMPLICIT;

-- sys.varbinarysysnvarchar: converts sys.BBF_VARBINARY to sys.NVARCHAR.
-- Bound to the C symbol 'varbinarynvarchar' in the babelfishpg_common
-- library. The C implementation reads the integer argument as the target
-- typmod; it does not read the boolean argument.
CREATE OR REPLACE FUNCTION sys.varbinarysysnvarchar(sys.BBF_VARBINARY, integer, boolean)
RETURNS sys.NVARCHAR
AS 'babelfishpg_common', 'varbinarynvarchar'
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE OR REPLACE FUNCTION sys.varbinaryvarchar(sys.BBF_VARBINARY, integer, boolean)
RETURNS pg_catalog.VARCHAR
AS 'babelfishpg_common', 'varbinaryvarchar'
Expand Down
201 changes: 201 additions & 0 deletions contrib/babelfishpg_common/src/varbinary.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,13 @@
#include "utils/pg_locale.h"
#include "utils/sortsupport.h"
#include "utils/varlena.h"
#include "lib/stringinfo.h"

#include "instr.h"
#include "logical.h"
#include "varchar.h"
#include "babelfishpg_common.h"
#include "typecode.h"

PG_FUNCTION_INFO_V1(varbinaryin);
PG_FUNCTION_INFO_V1(varbinaryout);
Expand All @@ -56,8 +60,11 @@ PG_FUNCTION_INFO_V1(varbinaryrowversion);
PG_FUNCTION_INFO_V1(rowversionbinary);
PG_FUNCTION_INFO_V1(rowversionvarbinary);
PG_FUNCTION_INFO_V1(varcharvarbinary);
PG_FUNCTION_INFO_V1(nvarcharvarbinary);
PG_FUNCTION_INFO_V1(bpcharvarbinary);
PG_FUNCTION_INFO_V1(nvarcharbinary);
PG_FUNCTION_INFO_V1(varbinaryvarchar);
PG_FUNCTION_INFO_V1(varbinarynvarchar);
PG_FUNCTION_INFO_V1(varcharbinary);
PG_FUNCTION_INFO_V1(bpcharbinary);
PG_FUNCTION_INFO_V1(varcharrowversion);
Expand Down Expand Up @@ -721,6 +728,84 @@ varcharvarbinary(PG_FUNCTION_ARGS)
PG_RETURN_BYTEA_P(result);
}

/*
 * nvarcharvarbinary
 *		Cast an nvarchar value to varbinary.
 *
 * Arguments:
 *		0 - the nvarchar value; its bytes are treated as UTF-8
 *		1 - target typmod
 *		2 - true when the cast was requested explicitly
 *
 * The input is re-encoded as UTF-16 through TsqlUTF8toUTF16StringInfo(),
 * irrespective of the input encoding, and the resulting bytes become the
 * varbinary payload.  When the typmod carries a length, the payload is cut
 * to (typmod - VARHDRSZ) bytes; otherwise the whole encoded string is kept.
 *
 * Errors:
 *		ERRCODE_DATATYPE_MISMATCH when the cast is implicit.
 *		ERRCODE_INTERNAL_ERROR when the UTF-16 conversion raises an error;
 *		the original message is appended to the new one.
 */
Datum
nvarcharvarbinary(PG_FUNCTION_ARGS)
{
	VarChar    *input = PG_GETARG_VARCHAR_PP(0);
	char	   *utf8 = VARDATA_ANY(input);
	size_t		utf8_len = VARSIZE_ANY_EXHDR(input);
	int32		typmod = PG_GETARG_INT32(1);
	bool		isExplicit = PG_GETARG_BOOL(2);
	MemoryContext caller_cxt = CurrentMemoryContext;
	StringInfoData utf16;
	int			out_len;
	int32		limit;
	bytea	   *result;

	if (!isExplicit)
		ereport(ERROR,
				(errcode(ERRCODE_DATATYPE_MISMATCH),
				 errmsg("Implicit conversion from data type nvarchar to "
						"varbinary is not allowed. Use the CONVERT function "
						"to run this query.")));

	initStringInfo(&utf16);
	PG_TRY();
	{
		/* Fill utf16 with the UTF-16 form of the UTF-8 input. */
		TsqlUTF8toUTF16StringInfo(&utf16, utf8, utf8_len);
	}
	PG_CATCH();
	{
		MemoryContext err_cxt;
		ErrorData  *edata;

		/*
		 * Copy the error into the caller's context before flushing the
		 * error state, so its message survives to be re-reported below.
		 */
		err_cxt = MemoryContextSwitchTo(caller_cxt);
		edata = CopyErrorData();
		FlushErrorState();
		MemoryContextSwitchTo(err_cxt);

		ereport(ERROR,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("Failed to convert from data type nvarchar to varbinary, %s",
						edata->message)));
	}
	PG_END_TRY();

	out_len = utf16.len;

	/*
	 * A typmod of -1 (or any value below VARHDRSZ) means "no limit".  The
	 * limit applies to the UTF-16 encoded byte length, not to the UTF-8
	 * input length.
	 */
	limit = (typmod < (int32) VARHDRSZ) ? out_len : typmod - VARHDRSZ;
	if (limit < out_len)
		out_len = limit;

	result = (bytea *) palloc0(out_len + VARHDRSZ);
	SET_VARSIZE(result, out_len + VARHDRSZ);
	memcpy(VARDATA(result), utf16.data, out_len);
	pfree(utf16.data);

	PG_RETURN_BYTEA_P(result);
}

Datum
bpcharvarbinary(PG_FUNCTION_ARGS)
{
Expand Down Expand Up @@ -835,6 +920,79 @@ varbinaryvarchar(PG_FUNCTION_ARGS)
PG_RETURN_VARCHAR_P(result);
}

/*
 * varbinarynvarchar
 *		Cast a binary/varbinary value to nvarchar.
 *
 * Arguments:
 *		0 - the binary value; its bytes are handed to the UTF-16 to UTF-8
 *			converter
 *		1 - target typmod; (typmod - VARHDRSZ) is used as the maximum number
 *			of 2-byte units to convert, a negative result means "no limit"
 *
 * Trailing zero bytes are dropped first.  If that leaves an odd number of
 * bytes, one zero byte is appended so the input is a whole number of 2-byte
 * units.  The bytes are then converted with TsqlUTF16toUTF8StringInfo().
 *
 * Errors:
 *		ERRCODE_INTERNAL_ERROR when the conversion raises an error; the
 *		original message is appended to the new one.
 */
Datum
varbinarynvarchar(PG_FUNCTION_ARGS)
{
	bytea	   *input = PG_GETARG_BYTEA_PP(0);
	char	   *raw = VARDATA_ANY(input);
	size_t		nbytes = VARSIZE_ANY_EXHDR(input);
	/* One spare, zero-filled byte so an odd-length input can be padded. */
	char	   *padded = (char *) palloc0(nbytes + 1);
	MemoryContext caller_cxt = CurrentMemoryContext;
	int32		typmod = PG_GETARG_INT32(1);
	int			max_units = typmod - VARHDRSZ;
	StringInfoData utf8;
	VarChar    *result;

	/* Drop trailing zero bytes. */
	while (nbytes > 0 && raw[nbytes - 1] == '\0')
		nbytes--;

	/*
	 * Copy what is left.  If the length is odd, extend it by one; the extra
	 * byte is already zero because the buffer came from palloc0.
	 */
	memcpy(padded, raw, nbytes);
	if ((nbytes & 1) != 0)
		nbytes += 1;

	/* Honour the typmod limit, counted in 2-byte units. */
	if (max_units >= 0 && (nbytes / 2) > max_units)
		nbytes = max_units << 1;

	PG_TRY();
	{
		/* Fill utf8 with the UTF-8 form of the UTF-16 input. */
		initStringInfo(&utf8);
		TsqlUTF16toUTF8StringInfo(&utf8, padded, nbytes);
	}
	PG_CATCH();
	{
		MemoryContext err_cxt;
		ErrorData  *edata;

		/*
		 * Copy the error into the caller's context before flushing the
		 * error state, so its message survives to be re-reported below.
		 */
		err_cxt = MemoryContextSwitchTo(caller_cxt);
		edata = CopyErrorData();
		FlushErrorState();
		MemoryContextSwitchTo(err_cxt);

		ereport(ERROR,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("Failed to convert from data type varbinary to nvarchar, %s",
						edata->message)));
	}
	PG_END_TRY();

	result = (VarChar *) cstring_to_text_with_len(utf8.data, utf8.len);
	pfree(utf8.data);
	pfree(padded);

	PG_RETURN_VARCHAR_P(result);
}


Datum
varcharbinary(PG_FUNCTION_ARGS)
{
Expand Down Expand Up @@ -874,6 +1032,49 @@ varcharbinary(PG_FUNCTION_ARGS)
PG_RETURN_BYTEA_P(result);
}

/*
 * nvarcharbinary
 *		Cast an nvarchar value to (fixed-length) binary.
 *
 * Arguments:
 *		0 - the nvarchar value; its bytes are treated as UTF-8
 *		1 - target typmod
 *		2 - true when the cast was requested explicitly
 *
 * The input is re-encoded as UTF-16 through TsqlUTF8toUTF16StringInfo().
 * The result is always as wide as the typmod asks for (typmod - VARHDRSZ
 * bytes), or as wide as the encoded string when the typmod carries no
 * length.  Longer encoded data is cut to that width; shorter data is
 * followed by zero bytes, since the result buffer comes from palloc0.
 *
 * Errors:
 *		ERRCODE_DATATYPE_MISMATCH when the cast is implicit.
 *		Errors raised by the conversion helper are not caught here.
 */
Datum
nvarcharbinary(PG_FUNCTION_ARGS)
{
	VarChar    *input = PG_GETARG_VARCHAR_PP(0);
	char	   *utf8 = VARDATA_ANY(input);
	size_t		utf8_len = VARSIZE_ANY_EXHDR(input);
	int32		typmod = PG_GETARG_INT32(1);
	bool		isExplicit = PG_GETARG_BOOL(2);
	StringInfoData utf16;
	size_t		copy_len;
	int32		width;
	bytea	   *result;

	if (!isExplicit)
		ereport(ERROR,
				(errcode(ERRCODE_DATATYPE_MISMATCH),
				 errmsg("Implicit conversion from data type nvarchar to "
						"binary is not allowed. Use the CONVERT function "
						"to run this query.")));

	/* Re-encode the UTF-8 input as UTF-16. */
	initStringInfo(&utf16);
	TsqlUTF8toUTF16StringInfo(&utf16, utf8, utf8_len);
	copy_len = utf16.len;

	/* A typmod of -1 (or any value below VARHDRSZ) means "use actual length". */
	if (typmod >= (int32) VARHDRSZ)
		width = typmod - VARHDRSZ;
	else
		width = copy_len;

	/* Never copy more than the result can hold. */
	if (copy_len > width)
		copy_len = width;

	/* The result is exactly `width` bytes; the unfilled tail stays zero. */
	result = (bytea *) palloc0(width + VARHDRSZ);
	SET_VARSIZE(result, width + VARHDRSZ);
	memcpy(VARDATA(result), utf16.data, copy_len);
	pfree(utf16.data);

	PG_RETURN_BYTEA_P(result);
}

Datum
bpcharbinary(PG_FUNCTION_ARGS)
{
Expand Down
Loading

0 comments on commit b466789

Please sign in to comment.