From 5e8c15849656f15acf06435bffe978673c69c2f8 Mon Sep 17 00:00:00 2001 From: Tanner Clary Date: Mon, 11 Sep 2023 19:00:07 -0700 Subject: [PATCH] [CALCITE-6001] Add dialect-specific encoding for string literals --- babel/src/test/resources/sql/redshift.iq | 4 +- core/src/main/codegen/templates/Parser.jj | 10 ++- .../calcite/sql/SqlCharStringLiteral.java | 1 + .../org/apache/calcite/sql/SqlDialect.java | 67 +++++++++++++------ .../sql/dialect/BigQuerySqlDialect.java | 1 + .../calcite/sql/dialect/HiveSqlDialect.java | 1 + .../calcite/sql/dialect/MysqlSqlDialect.java | 1 + .../sql/dialect/RedshiftSqlDialect.java | 1 + .../calcite/sql/dialect/SparkSqlDialect.java | 1 + .../calcite/sql/fun/SqlCastFunction.java | 12 ---- .../sql/fun/SqlLiteralChainOperator.java | 1 - .../sql/parser/SqlAbstractParserImpl.java | 7 ++ .../apache/calcite/sql/parser/SqlParser.java | 13 ++++ .../rel/rel2sql/RelToSqlConverterTest.java | 28 +++++++- .../org/apache/calcite/tools/PlannerTest.java | 5 +- .../calcite/sql/parser/SqlParserFixture.java | 12 ++++ .../calcite/sql/parser/SqlParserTest.java | 44 +++++++++--- 17 files changed, 162 insertions(+), 47 deletions(-) diff --git a/babel/src/test/resources/sql/redshift.iq b/babel/src/test/resources/sql/redshift.iq index 4917e3124a0..e0ef58abd13 100755 --- a/babel/src/test/resources/sql/redshift.iq +++ b/babel/src/test/resources/sql/redshift.iq @@ -1777,7 +1777,7 @@ SELECT "LENGTH"('ily') -- returns 8 (cf OCTET_LENGTH) select length('français'); -SELECT "LENGTH"(u&'fran\00e7ais') +SELECT "LENGTH"('français') !explain-validated-on calcite # LOWER @@ -1824,7 +1824,7 @@ f7415e33f972c03abd4f3fed36748f7a # OCTET_LENGTH -- returns 9 (cf LENGTH) select octet_length('français'); -SELECT OCTET_LENGTH(CAST(u&'fran\00e7ais' AS VARBINARY)) +SELECT OCTET_LENGTH(CAST('français' AS VARBINARY)) !explain-validated-on calcite # POSITION is a synonym for STRPOS diff --git a/core/src/main/codegen/templates/Parser.jj b/core/src/main/codegen/templates/Parser.jj index 
f03f36409fa..c8923d23b88 100644 --- a/core/src/main/codegen/templates/Parser.jj +++ b/core/src/main/codegen/templates/Parser.jj @@ -164,6 +164,7 @@ public class ${parser.class} extends SqlAbstractParserImpl private Casing unquotedCasing; private Casing quotedCasing; + private String charset; private int identifierMaxLength; private SqlConformance conformance; @@ -178,6 +179,7 @@ public class ${parser.class} extends SqlAbstractParserImpl ((SourceStringReader) reader).getSourceString(); parser.setOriginalSql(sql); } + parser.setConformance(SqlConformanceEnum.DEFAULT); return parser; } }; @@ -222,6 +224,10 @@ public class ${parser.class} extends SqlAbstractParserImpl this.unquotedCasing = unquotedCasing; } + public void setCharset(String charset) { + this.charset = charset; + } + public void setIdentifierMaxLength(int identifierMaxLength) { this.identifierMaxLength = identifierMaxLength; } @@ -4519,7 +4525,7 @@ SqlNode StringLiteral() : String p; final List frags; char unicodeEscapeChar = 0; - String charSet = null; + String charSet = this.charset; SqlCharStringLiteral literal; } { @@ -4653,7 +4659,7 @@ SqlNode StringLiteral() : p = SqlParserUtil.stripQuotes(getToken(0).image, DQ, DQ, "\\\"", Casing.UNCHANGED); try { - return SqlLiteral.createCharString(p, charSet, getPos()); + return literal = SqlLiteral.createCharString(p, charSet, getPos()); } catch (java.nio.charset.UnsupportedCharsetException e) { throw SqlUtil.newContextException(getPos(), RESOURCE.unknownCharacterSet(charSet)); diff --git a/core/src/main/java/org/apache/calcite/sql/SqlCharStringLiteral.java b/core/src/main/java/org/apache/calcite/sql/SqlCharStringLiteral.java index 46fbc332d7f..8a6fca3d9bc 100644 --- a/core/src/main/java/org/apache/calcite/sql/SqlCharStringLiteral.java +++ b/core/src/main/java/org/apache/calcite/sql/SqlCharStringLiteral.java @@ -78,6 +78,7 @@ private NlsString getValueNonNull() { writer.literal( writer.getDialect().quoteStringLiteral(stringValue)); } + 
writer.literal(nlsString.asSql(true, true, writer.getDialect())); } diff --git a/core/src/main/java/org/apache/calcite/sql/SqlDialect.java b/core/src/main/java/org/apache/calcite/sql/SqlDialect.java index ae07d64cf2e..484407193be 100644 --- a/core/src/main/java/org/apache/calcite/sql/SqlDialect.java +++ b/core/src/main/java/org/apache/calcite/sql/SqlDialect.java @@ -154,6 +154,7 @@ public class SqlDialect { private final Casing unquotedCasing; private final Casing quotedCasing; private final boolean caseSensitive; + private final String charset; //~ Constructors ----------------------------------------------------------- @@ -238,6 +239,7 @@ public SqlDialect(Context context) { this.unquotedCasing = requireNonNull(context.unquotedCasing()); this.quotedCasing = requireNonNull(context.quotedCasing()); this.caseSensitive = context.caseSensitive(); + this.charset = context.charset(); } //~ Methods ---------------------------------------------------------------- @@ -245,7 +247,7 @@ public SqlDialect(Context context) { /** Creates an empty context. Use {@link #EMPTY_CONTEXT} to reference the instance. */ private static Context emptyContext() { return new ContextImpl(DatabaseProduct.UNKNOWN, null, null, -1, -1, - "'", "''", null, null, + "'", "''", null, null, "ISO-8859-1", Casing.UNCHANGED, Casing.TO_UPPER, true, SqlConformanceEnum.DEFAULT, NullCollation.HIGH, RelDataTypeSystemImpl.DEFAULT, JethroDataSqlDialect.JethroInfo.EMPTY); @@ -433,7 +435,9 @@ public void quoteStringLiteral(StringBuilder buf, @Nullable String charsetName, if (containsNonAscii(val) && charsetName == null) { quoteStringLiteralUnicode(buf, val); } else { - if (charsetName != null) { + // Don't append charset if it matches dialect default, e.g. 
BigQuery shouldn't append _UTF-8 + // because that is the default + if (charsetName != null && !charsetName.equals(getCharset())) { buf.append("_"); buf.append(charsetName); } @@ -1180,7 +1184,8 @@ public SqlParser.Config configureParser(SqlParser.Config config) { .withUnquotedCasing(getUnquotedCasing()) .withCaseSensitive(isCaseSensitive()) .withConformance(getConformance()) - .withCharLiteralStyles(ImmutableSet.of(CharLiteralStyle.STANDARD)); + .withCharLiteralStyles(ImmutableSet.of(CharLiteralStyle.STANDARD)) + .withCharset(getCharset()); } @Deprecated // to be removed before 2.0 @@ -1240,6 +1245,11 @@ public Casing getQuotedCasing() { return quotedCasing; } + /** Returns charset to use for encoding. */ + public String getCharset() { + return charset; + } + /** Returns whether matching of identifiers is case-sensitive. */ public boolean isCaseSensitive() { return caseSensitive; @@ -1428,6 +1438,8 @@ Context withLiteralEscapedQuoteString( @Nullable String identifierEscapedQuoteString(); Context withIdentifierEscapedQuoteString( @Nullable String identifierEscapedQuoteString); + String charset(); + Context withCharset(String charset); Casing unquotedCasing(); Context withUnquotedCasing(Casing unquotedCasing); Casing quotedCasing(); @@ -1455,6 +1467,7 @@ private static class ContextImpl implements Context { private final String literalEscapedQuoteString; private final @Nullable String identifierQuoteString; private final @Nullable String identifierEscapedQuoteString; + private final String charset; private final Casing unquotedCasing; private final Casing quotedCasing; private final boolean caseSensitive; @@ -1468,7 +1481,7 @@ private ContextImpl(DatabaseProduct databaseProduct, int databaseMajorVersion, int databaseMinorVersion, String literalQuoteString, String literalEscapedQuoteString, @Nullable String identifierQuoteString, - @Nullable String identifierEscapedQuoteString, + @Nullable String identifierEscapedQuoteString, String charset, Casing quotedCasing, 
Casing unquotedCasing, boolean caseSensitive, SqlConformance conformance, NullCollation nullCollation, RelDataTypeSystem dataTypeSystem, @@ -1482,6 +1495,7 @@ private ContextImpl(DatabaseProduct databaseProduct, this.literalEscapedQuoteString = literalEscapedQuoteString; this.identifierQuoteString = identifierQuoteString; this.identifierEscapedQuoteString = identifierEscapedQuoteString; + this.charset = charset; this.quotedCasing = requireNonNull(quotedCasing, "quotedCasing"); this.unquotedCasing = requireNonNull(unquotedCasing, "unquotedCasing"); this.caseSensitive = caseSensitive; @@ -1501,7 +1515,7 @@ private ContextImpl(DatabaseProduct databaseProduct, databaseVersion, databaseMajorVersion, databaseMinorVersion, literalQuoteString, literalEscapedQuoteString, identifierQuoteString, identifierEscapedQuoteString, - quotedCasing, unquotedCasing, caseSensitive, + charset, quotedCasing, unquotedCasing, caseSensitive, conformance, nullCollation, dataTypeSystem, jethroInfo); } @@ -1514,7 +1528,7 @@ private ContextImpl(DatabaseProduct databaseProduct, databaseVersion, databaseMajorVersion, databaseMinorVersion, literalQuoteString, literalEscapedQuoteString, identifierQuoteString, identifierEscapedQuoteString, - quotedCasing, unquotedCasing, caseSensitive, + charset, quotedCasing, unquotedCasing, caseSensitive, conformance, nullCollation, dataTypeSystem, jethroInfo); } @@ -1527,7 +1541,7 @@ private ContextImpl(DatabaseProduct databaseProduct, databaseVersion, databaseMajorVersion, databaseMinorVersion, literalQuoteString, literalEscapedQuoteString, identifierQuoteString, identifierEscapedQuoteString, - quotedCasing, unquotedCasing, caseSensitive, + charset, quotedCasing, unquotedCasing, caseSensitive, conformance, nullCollation, dataTypeSystem, jethroInfo); } @@ -1540,7 +1554,7 @@ private ContextImpl(DatabaseProduct databaseProduct, databaseVersion, databaseMajorVersion, databaseMinorVersion, literalQuoteString, literalEscapedQuoteString, identifierQuoteString, 
identifierEscapedQuoteString, - quotedCasing, unquotedCasing, caseSensitive, + charset, quotedCasing, unquotedCasing, caseSensitive, conformance, nullCollation, dataTypeSystem, jethroInfo); } @@ -1553,7 +1567,7 @@ private ContextImpl(DatabaseProduct databaseProduct, databaseVersion, databaseMajorVersion, databaseMinorVersion, literalQuoteString, literalEscapedQuoteString, identifierQuoteString, identifierEscapedQuoteString, - quotedCasing, unquotedCasing, caseSensitive, + charset, quotedCasing, unquotedCasing, caseSensitive, conformance, nullCollation, dataTypeSystem, jethroInfo); } @@ -1566,7 +1580,7 @@ private ContextImpl(DatabaseProduct databaseProduct, databaseVersion, databaseMajorVersion, databaseMinorVersion, literalQuoteString, literalEscapedQuoteString, identifierQuoteString, identifierEscapedQuoteString, - quotedCasing, unquotedCasing, caseSensitive, + charset, quotedCasing, unquotedCasing, caseSensitive, conformance, nullCollation, dataTypeSystem, jethroInfo); } @@ -1580,7 +1594,7 @@ private ContextImpl(DatabaseProduct databaseProduct, databaseVersion, databaseMajorVersion, databaseMinorVersion, literalQuoteString, literalEscapedQuoteString, identifierQuoteString, identifierEscapedQuoteString, - quotedCasing, unquotedCasing, caseSensitive, + charset, quotedCasing, unquotedCasing, caseSensitive, conformance, nullCollation, dataTypeSystem, jethroInfo); } @@ -1594,7 +1608,7 @@ private ContextImpl(DatabaseProduct databaseProduct, databaseVersion, databaseMajorVersion, databaseMinorVersion, literalQuoteString, literalEscapedQuoteString, identifierQuoteString, identifierEscapedQuoteString, - quotedCasing, unquotedCasing, caseSensitive, + charset, quotedCasing, unquotedCasing, caseSensitive, conformance, nullCollation, dataTypeSystem, jethroInfo); } @@ -1608,7 +1622,20 @@ private ContextImpl(DatabaseProduct databaseProduct, databaseVersion, databaseMajorVersion, databaseMinorVersion, literalQuoteString, literalEscapedQuoteString, identifierQuoteString, 
identifierEscapedQuoteString, - quotedCasing, unquotedCasing, caseSensitive, + charset, quotedCasing, unquotedCasing, caseSensitive, + conformance, nullCollation, dataTypeSystem, jethroInfo); + } + + @Override public String charset() { + return charset; + } + + @Override public Context withCharset(String charset) { + return new ContextImpl(databaseProduct, databaseProductName, + databaseVersion, databaseMajorVersion, databaseMinorVersion, + literalQuoteString, literalEscapedQuoteString, + identifierQuoteString, identifierEscapedQuoteString, + charset, quotedCasing, unquotedCasing, caseSensitive, conformance, nullCollation, dataTypeSystem, jethroInfo); } @@ -1621,7 +1648,7 @@ private ContextImpl(DatabaseProduct databaseProduct, databaseVersion, databaseMajorVersion, databaseMinorVersion, literalQuoteString, literalEscapedQuoteString, identifierQuoteString, identifierEscapedQuoteString, - quotedCasing, unquotedCasing, caseSensitive, + charset, quotedCasing, unquotedCasing, caseSensitive, conformance, nullCollation, dataTypeSystem, jethroInfo); } @@ -1634,7 +1661,7 @@ private ContextImpl(DatabaseProduct databaseProduct, databaseVersion, databaseMajorVersion, databaseMinorVersion, literalQuoteString, literalEscapedQuoteString, identifierQuoteString, identifierEscapedQuoteString, - quotedCasing, unquotedCasing, caseSensitive, + charset, quotedCasing, unquotedCasing, caseSensitive, conformance, nullCollation, dataTypeSystem, jethroInfo); } @@ -1647,7 +1674,7 @@ private ContextImpl(DatabaseProduct databaseProduct, databaseVersion, databaseMajorVersion, databaseMinorVersion, literalQuoteString, literalEscapedQuoteString, identifierQuoteString, identifierEscapedQuoteString, - quotedCasing, unquotedCasing, caseSensitive, + charset, quotedCasing, unquotedCasing, caseSensitive, conformance, nullCollation, dataTypeSystem, jethroInfo); } @@ -1660,7 +1687,7 @@ private ContextImpl(DatabaseProduct databaseProduct, databaseVersion, databaseMajorVersion, databaseMinorVersion, 
literalQuoteString, literalEscapedQuoteString, identifierQuoteString, identifierEscapedQuoteString, - quotedCasing, unquotedCasing, caseSensitive, + charset, quotedCasing, unquotedCasing, caseSensitive, conformance, nullCollation, dataTypeSystem, jethroInfo); } @@ -1674,7 +1701,7 @@ private ContextImpl(DatabaseProduct databaseProduct, databaseVersion, databaseMajorVersion, databaseMinorVersion, literalQuoteString, literalEscapedQuoteString, identifierQuoteString, identifierEscapedQuoteString, - quotedCasing, unquotedCasing, caseSensitive, + charset, quotedCasing, unquotedCasing, caseSensitive, conformance, nullCollation, dataTypeSystem, jethroInfo); } @@ -1687,7 +1714,7 @@ private ContextImpl(DatabaseProduct databaseProduct, databaseVersion, databaseMajorVersion, databaseMinorVersion, literalQuoteString, literalEscapedQuoteString, identifierQuoteString, identifierEscapedQuoteString, - quotedCasing, unquotedCasing, caseSensitive, + charset, quotedCasing, unquotedCasing, caseSensitive, conformance, nullCollation, dataTypeSystem, jethroInfo); } @@ -1700,7 +1727,7 @@ private ContextImpl(DatabaseProduct databaseProduct, databaseVersion, databaseMajorVersion, databaseMinorVersion, literalQuoteString, literalEscapedQuoteString, identifierQuoteString, identifierEscapedQuoteString, - quotedCasing, unquotedCasing, caseSensitive, + charset, quotedCasing, unquotedCasing, caseSensitive, conformance, nullCollation, dataTypeSystem, jethroInfo); } } diff --git a/core/src/main/java/org/apache/calcite/sql/dialect/BigQuerySqlDialect.java b/core/src/main/java/org/apache/calcite/sql/dialect/BigQuerySqlDialect.java index 15ae0bad0b4..751ae3d05b2 100644 --- a/core/src/main/java/org/apache/calcite/sql/dialect/BigQuerySqlDialect.java +++ b/core/src/main/java/org/apache/calcite/sql/dialect/BigQuerySqlDialect.java @@ -68,6 +68,7 @@ public class BigQuerySqlDialect extends SqlDialect { .withLiteralEscapedQuoteString("\\'") .withIdentifierQuoteString("`") 
.withIdentifierEscapedQuoteString("\\`") + .withCharset("UTF-8") .withNullCollation(NullCollation.LOW) .withUnquotedCasing(Casing.UNCHANGED) .withQuotedCasing(Casing.UNCHANGED) diff --git a/core/src/main/java/org/apache/calcite/sql/dialect/HiveSqlDialect.java b/core/src/main/java/org/apache/calcite/sql/dialect/HiveSqlDialect.java index 2feacf6a6a0..c6147217df3 100644 --- a/core/src/main/java/org/apache/calcite/sql/dialect/HiveSqlDialect.java +++ b/core/src/main/java/org/apache/calcite/sql/dialect/HiveSqlDialect.java @@ -39,6 +39,7 @@ public class HiveSqlDialect extends SqlDialect { public static final SqlDialect.Context DEFAULT_CONTEXT = SqlDialect.EMPTY_CONTEXT .withDatabaseProduct(SqlDialect.DatabaseProduct.HIVE) + .withCharset("UTF-8") .withNullCollation(NullCollation.LOW); public static final SqlDialect DEFAULT = new HiveSqlDialect(DEFAULT_CONTEXT); diff --git a/core/src/main/java/org/apache/calcite/sql/dialect/MysqlSqlDialect.java b/core/src/main/java/org/apache/calcite/sql/dialect/MysqlSqlDialect.java index b303641b155..b4f993e47b4 100644 --- a/core/src/main/java/org/apache/calcite/sql/dialect/MysqlSqlDialect.java +++ b/core/src/main/java/org/apache/calcite/sql/dialect/MysqlSqlDialect.java @@ -81,6 +81,7 @@ public class MysqlSqlDialect extends SqlDialect { .withIdentifierQuoteString("`") .withDataTypeSystem(MYSQL_TYPE_SYSTEM) .withUnquotedCasing(Casing.UNCHANGED) + .withCharset("UTF-8") .withNullCollation(NullCollation.LOW); public static final SqlDialect DEFAULT = new MysqlSqlDialect(DEFAULT_CONTEXT); diff --git a/core/src/main/java/org/apache/calcite/sql/dialect/RedshiftSqlDialect.java b/core/src/main/java/org/apache/calcite/sql/dialect/RedshiftSqlDialect.java index 4e94977ce30..84168962813 100644 --- a/core/src/main/java/org/apache/calcite/sql/dialect/RedshiftSqlDialect.java +++ b/core/src/main/java/org/apache/calcite/sql/dialect/RedshiftSqlDialect.java @@ -62,6 +62,7 @@ public class RedshiftSqlDialect extends SqlDialect { 
.withQuotedCasing(Casing.TO_LOWER) .withUnquotedCasing(Casing.TO_LOWER) .withCaseSensitive(false) + .withCharset("UTF-8") .withDataTypeSystem(TYPE_SYSTEM); public static final SqlDialect DEFAULT = new RedshiftSqlDialect(DEFAULT_CONTEXT); diff --git a/core/src/main/java/org/apache/calcite/sql/dialect/SparkSqlDialect.java b/core/src/main/java/org/apache/calcite/sql/dialect/SparkSqlDialect.java index e97cfcac01b..e6515e94c38 100644 --- a/core/src/main/java/org/apache/calcite/sql/dialect/SparkSqlDialect.java +++ b/core/src/main/java/org/apache/calcite/sql/dialect/SparkSqlDialect.java @@ -44,6 +44,7 @@ public class SparkSqlDialect extends SqlDialect { public static final SqlDialect.Context DEFAULT_CONTEXT = SqlDialect.EMPTY_CONTEXT .withDatabaseProduct(SqlDialect.DatabaseProduct.SPARK) + .withCharset("UTF-8") .withNullCollation(NullCollation.LOW); public static final SqlDialect DEFAULT = new SparkSqlDialect(DEFAULT_CONTEXT); diff --git a/core/src/main/java/org/apache/calcite/sql/fun/SqlCastFunction.java b/core/src/main/java/org/apache/calcite/sql/fun/SqlCastFunction.java index 81da37c7e70..63e52eefcaf 100644 --- a/core/src/main/java/org/apache/calcite/sql/fun/SqlCastFunction.java +++ b/core/src/main/java/org/apache/calcite/sql/fun/SqlCastFunction.java @@ -227,18 +227,6 @@ private static RelDataType createTypeWithNullabilityFromExpr(RelDataTypeFactory } return false; } - if (SqlTypeUtil.areCharacterSetsMismatched( - validatedNodeType, - returnType)) { - if (throwOnFailure) { - // Include full type string to indicate character - // set mismatch. 
- throw callBinding.newError( - RESOURCE.cannotCastValue(validatedNodeType.getFullTypeString(), - returnType.getFullTypeString())); - } - return false; - } return true; } diff --git a/core/src/main/java/org/apache/calcite/sql/fun/SqlLiteralChainOperator.java b/core/src/main/java/org/apache/calcite/sql/fun/SqlLiteralChainOperator.java index 9e23989087f..de7d8906209 100644 --- a/core/src/main/java/org/apache/calcite/sql/fun/SqlLiteralChainOperator.java +++ b/core/src/main/java/org/apache/calcite/sql/fun/SqlLiteralChainOperator.java @@ -166,7 +166,6 @@ private static boolean argTypesValid(SqlCallBinding callBinding) { if (operand.i == 0) { collation = nls.getCollation(); - // print with prefix writer.literal(nls.asSql(true, false, writer.getDialect())); } else { // print without prefix diff --git a/core/src/main/java/org/apache/calcite/sql/parser/SqlAbstractParserImpl.java b/core/src/main/java/org/apache/calcite/sql/parser/SqlAbstractParserImpl.java index bd9b0bdb32a..2327faa109a 100644 --- a/core/src/main/java/org/apache/calcite/sql/parser/SqlAbstractParserImpl.java +++ b/core/src/main/java/org/apache/calcite/sql/parser/SqlAbstractParserImpl.java @@ -537,6 +537,13 @@ protected SqlCall createCall( */ public abstract void setUnquotedCasing(Casing unquotedCasing); + /** + * Sets the charset. + * + * @param charset Charset to set. + */ + public abstract void setCharset(String charset); + /** * Sets the maximum length for sql identifier. 
*/ diff --git a/core/src/main/java/org/apache/calcite/sql/parser/SqlParser.java b/core/src/main/java/org/apache/calcite/sql/parser/SqlParser.java index 237a366606c..b2004a5189d 100644 --- a/core/src/main/java/org/apache/calcite/sql/parser/SqlParser.java +++ b/core/src/main/java/org/apache/calcite/sql/parser/SqlParser.java @@ -19,6 +19,7 @@ import org.apache.calcite.avatica.util.Casing; import org.apache.calcite.avatica.util.Quoting; import org.apache.calcite.avatica.util.TimeUnit; +import org.apache.calcite.config.CalciteSystemProperty; import org.apache.calcite.config.CharLiteralStyle; import org.apache.calcite.config.Lex; import org.apache.calcite.rel.type.RelDataTypeSystem; @@ -75,6 +76,7 @@ private SqlParser(SqlAbstractParserImpl parser, parser.setIdentifierMaxLength(config.identifierMaxLength()); parser.setTimeUnitCodes(config.timeUnitCodes()); parser.setConformance(config.conformance()); + parser.setCharset(config.charset()); parser.switchTo(SqlAbstractParserImpl.LexicalState.forConfig(config)); } @@ -288,6 +290,13 @@ public interface Config { /** Sets {@link #unquotedCasing()}. */ Config withUnquotedCasing(Casing casing); + @Value.Default default String charset() { + return CalciteSystemProperty.DEFAULT_CHARSET.value(); + } + + /** Sets {@link #charset()}. 
*/ + Config withCharset(String charset); + @Value.Default default Quoting quoting() { return Quoting.DOUBLE_QUOTE; } @@ -379,6 +388,10 @@ public ConfigBuilder setUnquotedCasing(Casing unquotedCasing) { return setConfig(config.withUnquotedCasing(unquotedCasing)); } + public ConfigBuilder setCharset(String charset) { + return setConfig(config.withCharset(charset)); + } + public ConfigBuilder setQuoting(Quoting quoting) { return setConfig(config.withQuoting(quoting)); } diff --git a/core/src/test/java/org/apache/calcite/rel/rel2sql/RelToSqlConverterTest.java b/core/src/test/java/org/apache/calcite/rel/rel2sql/RelToSqlConverterTest.java index de7b08a6ab2..c2e6af1142e 100644 --- a/core/src/test/java/org/apache/calcite/rel/rel2sql/RelToSqlConverterTest.java +++ b/core/src/test/java/org/apache/calcite/rel/rel2sql/RelToSqlConverterTest.java @@ -288,7 +288,7 @@ private static String toSql(RelNode root, SqlDialect dialect, @Test void testCharset() { sql("select _UTF8'\u4F60\u597D'") .withMysql() // produces a simpler output query - .ok("SELECT _UTF-8'\u4F60\u597D'"); + .ok("SELECT '\u4F60\u597D'"); sql("select _UTF16'" + ConversionUtil.TEST_UNICODE_STRING + "'") .withMysql() .ok("SELECT _UTF-16LE'" + ConversionUtil.TEST_UNICODE_STRING + "'"); @@ -1512,6 +1512,9 @@ private static String toSql(RelNode root, SqlDialect dialect, assertThat(toSql(root), isLinux(expectedSql)); } + + + /** Test case for * [CALCITE-5394] * RelToSql converter fails when semi-join is under a join node. */ @@ -2573,6 +2576,29 @@ private SqlDialect nonOrdinalDialect() { .withBigQuery().ok(expectedBigQuery); } + /** Test case for + * [CALCITE-6001] + * Add withCharset to allow dialect-specific encoding. 
*/ + @Test void testStringLiteralEncoding() { + final SqlParser.Config parserConfig = + BigQuerySqlDialect.DEFAULT.configureParser(SqlParser.config()); + final String query = "select 'ק' from `foodmart`.`product`"; + final String failedQuery = "select 'ק' from \"product\""; + final String expectedBigQuery = "SELECT 'ק'\nFROM foodmart.product"; + final String expectedMySql = "SELECT 'ק'\nFROM `foodmart`.`product`"; + final String expectedRedshift = "SELECT 'ק'\nFROM \"foodmart\".\"product\""; + // Dialects that do not use UTF-8 as their default should have a prefix appended + final String expectedOracle = "SELECT _UTF-8'ק'\nFROM \"foodmart\".\"product\""; + + sql(failedQuery).throws_("Failed to encode 'ק' in character set 'ISO-8859-1'"); + sql(query).parserConfig(parserConfig).withBigQuery().ok(expectedBigQuery); + sql(query).parserConfig(parserConfig).withHive().ok(expectedBigQuery); + sql(query).parserConfig(parserConfig).withMysql().ok(expectedMySql); + sql(query).parserConfig(parserConfig).withRedshift().ok(expectedRedshift); + sql(query).parserConfig(parserConfig).withSpark().ok(expectedBigQuery); + sql(query).parserConfig(parserConfig).withOracle().ok(expectedOracle); + } + @Test void testIdentifier() { // Note that IGNORE is reserved in BigQuery but not in standard SQL final String query = "select *\n" diff --git a/core/src/test/java/org/apache/calcite/tools/PlannerTest.java b/core/src/test/java/org/apache/calcite/tools/PlannerTest.java index 92464f6aa39..2208ccabd15 100644 --- a/core/src/test/java/org/apache/calcite/tools/PlannerTest.java +++ b/core/src/test/java/org/apache/calcite/tools/PlannerTest.java @@ -65,6 +65,7 @@ import org.apache.calcite.sql.SqlFunctionCategory; import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.SqlNode; +import org.apache.calcite.sql.dialect.HiveSqlDialect; import org.apache.calcite.sql.fun.SqlStdOperatorTable; import org.apache.calcite.sql.parser.SqlParseException; import org.apache.calcite.sql.parser.SqlParser; 
@@ -820,7 +821,9 @@ private void runDuplicateSortCheck(String sql, String plan) throws Exception { /** Tests that Hive dialect does not generate "AS". */ @Test void testHiveDialect() throws SqlParseException { - Planner planner = getPlanner(null); + final SqlParser.Config parserConfig = + HiveSqlDialect.DEFAULT.configureParser(SqlParser.config()); + Planner planner = getPlanner(null, parserConfig); final String sql = "select * from (select * from \"emps\") as t\n" + "where \"name\" like '%e%'"; SqlNode parse = planner.parse(sql); diff --git a/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserFixture.java b/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserFixture.java index 34fc7ac1140..1f50cf8131d 100644 --- a/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserFixture.java +++ b/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserFixture.java @@ -161,6 +161,18 @@ public SqlParserFixture withDialect(SqlDialect dialect) { convertToLinux, parserChecker); } + /** Applies this fixture to some code for each of the given dialects. */ + void forEachDialect(Iterable dialects, + Consumer consumer) { + dialects.forEach(d -> { + try { + consumer.accept(this.withDialect(d)); + } catch (Exception e) { + throw new RuntimeException("for library " + d, e); + } + }); + } + /** * Creates a copy of this fixture with a new test factory.
*/ diff --git a/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserTest.java b/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserTest.java index 2086a300028..954d9cad44f 100644 --- a/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserTest.java +++ b/testkit/src/main/java/org/apache/calcite/sql/parser/SqlParserTest.java @@ -68,6 +68,7 @@ import java.util.function.UnaryOperator; import java.util.stream.Collectors; +import static org.apache.calcite.linq4j.tree.Expressions.list; import static org.apache.calcite.util.Static.RESOURCE; import static org.apache.calcite.util.Util.toLinux; @@ -604,6 +605,9 @@ public class SqlParserTest { SqlDialect.DatabaseProduct.BIG_QUERY.getDialect(); private static final SqlDialect CALCITE = SqlDialect.DatabaseProduct.CALCITE.getDialect(); + + private static final SqlDialect HIVE = + SqlDialect.DatabaseProduct.HIVE.getDialect(); private static final SqlDialect MSSQL = SqlDialect.DatabaseProduct.MSSQL.getDialect(); private static final SqlDialect MYSQL = @@ -615,6 +619,9 @@ public class SqlParserTest { private static final SqlDialect REDSHIFT = SqlDialect.DatabaseProduct.REDSHIFT.getDialect(); + private static final SqlDialect SPARK = + SqlDialect.DatabaseProduct.SPARK.getDialect(); + /** Creates the test fixture that determines the behavior of tests. * Sub-classes that, say, test different parser implementations should * override. */ @@ -1828,6 +1835,27 @@ void checkPeriodPredicate(Checker checker) { .ok("CAST('foo' AS `BAR`)"); } + /** Test case for + * [CALCITE-6001] + * Add withCharset to allow dialect-specific encoding. */ + @Test void testDialectSpecificEncoding() { + final SqlParserFixture f0 = fixture(); + // UTF-8 character that the Calcite default (ISO-8859-1) would not be able to encode. 
+ f0.sql("select 'ק'") + .fails("Failed to encode 'ק' in character set 'ISO-8859-1'"); + final Consumer consumer = f -> { + // UTF-8 + f.sql("select 'ק'").ok("SELECT 'ק'"); + // ASCII 7-bit + f.sql("select 'm'").ok("SELECT 'm'"); + // ASCII 8-bit + f.sql("select 'Ç'").ok("SELECT 'Ç'"); + }; + // The following dialects use UTF-8 as their default charset so the tests are run against + // each of them for consistency. + f0.forEachDialect(list(BIG_QUERY, HIVE, MYSQL, REDSHIFT, SPARK), consumer); + } + @Test void testCastFails() { expr("cast(x as time with ^time^ zone)") .fails("(?s).*Encountered \"time\" at .*"); @@ -3457,9 +3485,9 @@ void checkPeriodPredicate(Checker checker) { expr("'abba'\n'abba'").same(); expr("'abba'\n'0001'").same(); expr("N'yabba'\n'dabba'\n'doo'") - .ok("_ISO-8859-1'yabba'\n'dabba'\n'doo'"); + .ok("'yabba'\n'dabba'\n'doo'"); expr("_iso-8859-1'yabba'\n'dabba'\n'don''t'") - .ok("_ISO-8859-1'yabba'\n'dabba'\n'don''t'"); + .ok("'yabba'\n'dabba'\n'don''t'"); expr("x'01aa'\n'03ff'") .ok("X'01AA'\n'03FF'"); @@ -5120,19 +5148,19 @@ void checkPeriodPredicate(Checker checker) { expr("_latin1'hi'") .ok("_LATIN1'hi'"); expr("N'is it a plane? no it''s superman!'") - .ok("_ISO-8859-1'is it a plane? no it''s superman!'"); + .ok("'is it a plane?
no it''s superman!'"); expr("n'lowercase n'") - .ok("_ISO-8859-1'lowercase n'"); + .ok("'lowercase n'"); expr("'boring string'").same(); expr("_iSo-8859-1'bye'") - .ok("_ISO-8859-1'bye'"); + .ok("'bye'"); expr("'three'\n' blind'\n' mice'").same(); expr("'three' -- comment\n' blind'\n' mice'") .ok("'three'\n' blind'\n' mice'"); expr("N'bye' \t\r\f\f\n' bye'") - .ok("_ISO-8859-1'bye'\n' bye'"); + .ok("'bye'\n' bye'"); expr("_iso-8859-1'bye'\n\n--\n-- this is a comment\n' bye'") - .ok("_ISO-8859-1'bye'\n' bye'"); + .ok("'bye'\n' bye'"); expr("_utf8'hi'") .ok("_UTF8'hi'"); @@ -5156,7 +5184,7 @@ void checkPeriodPredicate(Checker checker) { // valid syntax, but should give a validator error sql("select (N'1' '2') from t") - .ok("SELECT _ISO-8859-1'1'\n" + .ok("SELECT '1'\n" + "'2'\n" + "FROM `T`"); }