sqlglot.dialects.bigquery
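The module source below implements the BigQuery dialect. As a quick orientation (an illustrative sketch, not part of the module itself), a dialect is normally exercised through sqlglot's public entry points:

    import sqlglot

    # Parse BigQuery SQL into sqlglot's AST and render it back out.
    ast = sqlglot.parse_one("SELECT TO_HEX(MD5('abc'))", read="bigquery")
    print(ast.sql(dialect="bigquery"))

    # Or transpile between dialects in one call.
    print(sqlglot.transpile("SELECT TO_HEX(MD5('abc'))", read="bigquery", write="duckdb")[0])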
from __future__ import annotations

import logging
import re
import typing as t

from sqlglot import exp, generator, parser, tokens, transforms
from sqlglot.dialects.dialect import (
    Dialect,
    NormalizationStrategy,
    arg_max_or_min_no_count,
    binary_from_function,
    date_add_interval_sql,
    datestrtodate_sql,
    build_formatted_time,
    filter_array_using_unnest,
    if_sql,
    inline_array_unless_query,
    max_or_greatest,
    min_or_least,
    no_ilike_sql,
    build_date_delta_with_interval,
    regexp_replace_sql,
    rename_func,
    sha256_sql,
    timestrtotime_sql,
    ts_or_ds_add_cast,
    unit_to_var,
)
from sqlglot.helper import seq_get, split_num_words
from sqlglot.tokens import TokenType
from sqlglot.generator import unsupported_args

if t.TYPE_CHECKING:
    from sqlglot._typing import E, Lit

    from sqlglot.optimizer.annotate_types import TypeAnnotator

logger = logging.getLogger("sqlglot")


def _derived_table_values_to_unnest(self: BigQuery.Generator, expression: exp.Values) -> str:
    if not expression.find_ancestor(exp.From, exp.Join):
        return self.values_sql(expression)

    structs = []
    alias = expression.args.get("alias")
    for tup in expression.find_all(exp.Tuple):
        field_aliases = (
            alias.columns
            if alias and alias.columns
            else (f"_c{i}" for i in range(len(tup.expressions)))
        )
        expressions = [
            exp.PropertyEQ(this=exp.to_identifier(name), expression=fld)
            for name, fld in zip(field_aliases, tup.expressions)
        ]
        structs.append(exp.Struct(expressions=expressions))

    # Due to `UNNEST_COLUMN_ONLY`, it is expected that the table alias be contained in the columns expression
    alias_name_only = exp.TableAlias(columns=[alias.this]) if alias else None
    return self.unnest_sql(
        exp.Unnest(expressions=[exp.array(*structs, copy=False)], alias=alias_name_only)
    )


def _returnsproperty_sql(self: BigQuery.Generator, expression: exp.ReturnsProperty) -> str:
    this = expression.this
    if isinstance(this, exp.Schema):
        this = f"{self.sql(this, 'this')} <{self.expressions(this)}>"
    else:
        this = self.sql(this)
    return f"RETURNS {this}"


def _create_sql(self: BigQuery.Generator, expression: exp.Create) -> str:
    returns = expression.find(exp.ReturnsProperty)
    if expression.kind == "FUNCTION" and returns and returns.args.get("is_table"):
        expression.set("kind", "TABLE FUNCTION")

        if isinstance(expression.expression, (exp.Subquery, exp.Literal)):
            expression.set("expression", expression.expression.this)

    return self.create_sql(expression)


# https://issuetracker.google.com/issues/162294746
# Workaround for a BigQuery bug when grouping by an expression and then ordering by its alias:
# WITH x AS (SELECT 1 y)
# SELECT y + 1 z
# FROM x
# GROUP BY y + 1
# ORDER BY z
def _alias_ordered_group(expression: exp.Expression) -> exp.Expression:
    if isinstance(expression, exp.Select):
        group = expression.args.get("group")
        order = expression.args.get("order")

        if group and order:
            aliases = {
                select.this: select.args["alias"]
                for select in expression.selects
                if isinstance(select, exp.Alias)
            }

            for grouped in group.expressions:
                if grouped.is_int:
                    continue
                alias = aliases.get(grouped)
                if alias:
                    grouped.replace(exp.column(alias))

    return expression


def _pushdown_cte_column_names(expression: exp.Expression) -> exp.Expression:
    """BigQuery doesn't allow column names when defining a CTE, so we try to push them down."""
    if isinstance(expression, exp.CTE) and expression.alias_column_names:
        cte_query = expression.this

        if cte_query.is_star:
            logger.warning(
                "Can't push down CTE column names for star queries. Run the query through"
                " the optimizer or use 'qualify' to expand the star projections first."
            )
            return expression

        column_names = expression.alias_column_names
        expression.args["alias"].set("columns", None)

        for name, select in zip(column_names, cte_query.selects):
            to_replace = select

            if isinstance(select, exp.Alias):
                select = select.this

            # Inner aliases are shadowed by the CTE column names
            to_replace.replace(exp.alias_(select, name))

    return expression


def _build_parse_timestamp(args: t.List) -> exp.StrToTime:
    this = build_formatted_time(exp.StrToTime, "bigquery")([seq_get(args, 1), seq_get(args, 0)])
    this.set("zone", seq_get(args, 2))
    return this


def _build_timestamp(args: t.List) -> exp.Timestamp:
    timestamp = exp.Timestamp.from_arg_list(args)
    timestamp.set("with_tz", True)
    return timestamp


def _build_date(args: t.List) -> exp.Date | exp.DateFromParts:
    expr_type = exp.DateFromParts if len(args) == 3 else exp.Date
    return expr_type.from_arg_list(args)


def _build_to_hex(args: t.List) -> exp.Hex | exp.MD5:
    # TO_HEX(MD5(..)) is common in BigQuery, so it's parsed into MD5 to simplify its transpilation
    arg = seq_get(args, 0)
    return exp.MD5(this=arg.this) if isinstance(arg, exp.MD5Digest) else exp.LowerHex(this=arg)


def _array_contains_sql(self: BigQuery.Generator, expression: exp.ArrayContains) -> str:
    return self.sql(
        exp.Exists(
            this=exp.select("1")
            .from_(exp.Unnest(expressions=[expression.left]).as_("_unnest", table=["_col"]))
            .where(exp.column("_col").eq(expression.right))
        )
    )


def _ts_or_ds_add_sql(self: BigQuery.Generator, expression: exp.TsOrDsAdd) -> str:
    return date_add_interval_sql("DATE", "ADD")(self, ts_or_ds_add_cast(expression))


def _ts_or_ds_diff_sql(self: BigQuery.Generator, expression: exp.TsOrDsDiff) -> str:
    expression.this.replace(exp.cast(expression.this, exp.DataType.Type.TIMESTAMP))
    expression.expression.replace(exp.cast(expression.expression, exp.DataType.Type.TIMESTAMP))
    unit = unit_to_var(expression)
    return self.func("DATE_DIFF", expression.this, expression.expression, unit)


def _unix_to_time_sql(self: BigQuery.Generator, expression: exp.UnixToTime) -> str:
    scale = expression.args.get("scale")
    timestamp = expression.this

    if scale in (None, exp.UnixToTime.SECONDS):
        return self.func("TIMESTAMP_SECONDS", timestamp)
    if scale == exp.UnixToTime.MILLIS:
        return self.func("TIMESTAMP_MILLIS", timestamp)
    if scale == exp.UnixToTime.MICROS:
        return self.func("TIMESTAMP_MICROS", timestamp)

    unix_seconds = exp.cast(
        exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT
    )
    return self.func("TIMESTAMP_SECONDS", unix_seconds)


def _build_time(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToTime(this=args[0])
    if len(args) == 2:
        return exp.Time.from_arg_list(args)
    return exp.TimeFromParts.from_arg_list(args)


def _build_datetime(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToTimestamp.from_arg_list(args)
    if len(args) == 2:
        return exp.Datetime.from_arg_list(args)
    return exp.TimestampFromParts.from_arg_list(args)


def _build_regexp_extract(
    expr_type: t.Type[E], default_group: t.Optional[exp.Expression] = None
) -> t.Callable[[t.List], E]:
    def _builder(args: t.List) -> E:
        try:
            group = re.compile(args[1].name).groups == 1
        except re.error:
            group = False

        # Default group is used for the transpilation of REGEXP_EXTRACT_ALL
        return expr_type(
            this=seq_get(args, 0),
            expression=seq_get(args, 1),
            position=seq_get(args, 2),
            occurrence=seq_get(args, 3),
            group=exp.Literal.number(1) if group else default_group,
        )

    return _builder


def _build_extract_json_with_default_path(expr_type: t.Type[E]) -> t.Callable[[t.List, Dialect], E]:
    def _builder(args: t.List, dialect: Dialect) -> E:
        if len(args) == 1:
            # The default value for the JSONPath is '$', i.e. all of the data
            args.append(exp.Literal.string("$"))
        return parser.build_extract_json_with_path(expr_type)(args, dialect)

    return _builder


def _str_to_datetime_sql(
    self: BigQuery.Generator, expression: exp.StrToDate | exp.StrToTime
) -> str:
    this = self.sql(expression, "this")
    dtype = "DATE" if isinstance(expression, exp.StrToDate) else "TIMESTAMP"

    if expression.args.get("safe"):
        fmt = self.format_time(
            expression,
            self.dialect.INVERSE_FORMAT_MAPPING,
            self.dialect.INVERSE_FORMAT_TRIE,
        )
        return f"SAFE_CAST({this} AS {dtype} FORMAT {fmt})"

    fmt = self.format_time(expression)
    return self.func(f"PARSE_{dtype}", fmt, this, expression.args.get("zone"))


def _annotate_math_functions(self: TypeAnnotator, expression: E) -> E:
    """
    Many BigQuery math functions, such as CEIL and FLOOR, follow this return type convention:
    +---------+---------+---------+------------+---------+
    | INPUT   | INT64   | NUMERIC | BIGNUMERIC | FLOAT64 |
    +---------+---------+---------+------------+---------+
    | OUTPUT  | FLOAT64 | NUMERIC | BIGNUMERIC | FLOAT64 |
    +---------+---------+---------+------------+---------+
    """
    self._annotate_args(expression)

    this: exp.Expression = expression.this

    self._set_type(
        expression,
        exp.DataType.Type.DOUBLE if this.is_type(*exp.DataType.INTEGER_TYPES) else this.type,
    )
    return expression


@unsupported_args("ins_cost", "del_cost", "sub_cost")
def _levenshtein_sql(self: BigQuery.Generator, expression: exp.Levenshtein) -> str:
    max_dist = expression.args.get("max_dist")
    if max_dist:
        max_dist = exp.Kwarg(this=exp.var("max_distance"), expression=max_dist)

    return self.func("EDIT_DISTANCE", expression.this, expression.expression, max_dist)


def _build_levenshtein(args: t.List) -> exp.Levenshtein:
    max_dist = seq_get(args, 2)
    return exp.Levenshtein(
        this=seq_get(args, 0),
        expression=seq_get(args, 1),
        max_dist=max_dist.expression if max_dist else None,
    )
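# Illustrative usage sketch (not part of the upstream source): the helper
# functions above are wired into the Parser/Generator classes below. For
# example, BigQuery has no derived-table VALUES clause, so
# _derived_table_values_to_unnest rewrites one into an UNNEST over an array
# of STRUCTs, roughly:
#
#   >>> import sqlglot
#   >>> sqlglot.transpile(
#   ...     "SELECT c FROM (VALUES (1), (2)) AS t(c)", read="postgres", write="bigquery"
#   ... )[0]
#   'SELECT c FROM UNNEST([STRUCT(1 AS c), STRUCT(2 AS c)]) AS t'
#
# The exact output may differ across sqlglot versions.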
class BigQuery(Dialect):
    WEEK_OFFSET = -1
    UNNEST_COLUMN_ONLY = True
    SUPPORTS_USER_DEFINED_TYPES = False
    SUPPORTS_SEMI_ANTI_JOIN = False
    LOG_BASE_FIRST = False
    HEX_LOWERCASE = True
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    EXPAND_ALIAS_REFS_EARLY_ONLY_IN_GROUP_BY = True

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    # BigQuery UDFs are case-sensitive
    NORMALIZE_FUNCTIONS = False

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time
    TIME_MAPPING = {
        "%D": "%m/%d/%y",
        "%E6S": "%S.%f",
        "%e": "%-d",
    }

    FORMAT_MAPPING = {
        "DD": "%d",
        "MM": "%m",
        "MON": "%b",
        "MONTH": "%B",
        "YYYY": "%Y",
        "YY": "%y",
        "HH": "%I",
        "HH12": "%I",
        "HH24": "%H",
        "MI": "%M",
        "SS": "%S",
        "SSSSS": "%f",
        "TZH": "%z",
    }

    # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement
    # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table
    PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE"}

    # All set operations require either a DISTINCT or ALL specifier
    SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None)

    ANNOTATORS = {
        **Dialect.ANNOTATORS,
        **{
            expr_type: lambda self, e: _annotate_math_functions(self, e)
            for expr_type in (exp.Floor, exp.Ceil, exp.Log, exp.Ln, exp.Sqrt, exp.Exp, exp.Round)
        },
        **{
            expr_type: lambda self, e: self._annotate_by_args(e, "this")
            for expr_type in (
                exp.Left,
                exp.Right,
                exp.Lower,
                exp.Upper,
                exp.Pad,
                exp.Trim,
                exp.RegexpExtract,
                exp.RegexpReplace,
                exp.Repeat,
                exp.Substring,
            )
        },
        exp.Concat: lambda self, e: self._annotate_by_args(e, "expressions"),
        exp.Sign: lambda self, e: self._annotate_by_args(e, "this"),
        exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True),
    }

    def normalize_identifier(self, expression: E) -> E:
        if (
            isinstance(expression, exp.Identifier)
            and self.normalization_strategy is not NormalizationStrategy.CASE_SENSITIVE
        ):
            parent = expression.parent
            while isinstance(parent, exp.Dot):
                parent = parent.parent

            # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
            # by default. The following check uses a heuristic to detect tables based on whether
            # they are qualified. This should generally be correct, because tables in BigQuery
            # must be qualified with at least a dataset, unless @@dataset_id is set.
            case_sensitive = (
                isinstance(parent, exp.UserDefinedFunction)
                or (
                    isinstance(parent, exp.Table)
                    and parent.db
                    and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
                )
                or expression.meta.get("is_table")
            )
            if not case_sensitive:
                expression.set("this", expression.this.lower())

        return expression

    class Tokenizer(tokens.Tokenizer):
        QUOTES = ["'", '"', '"""', "'''"]
        COMMENTS = ["--", "#", ("/*", "*/")]
        IDENTIFIERS = ["`"]
        STRING_ESCAPES = ["\\"]

        HEX_STRINGS = [("0x", ""), ("0X", "")]

        BYTE_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
        ]

        RAW_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
        ]

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            "ANY TYPE": TokenType.VARIANT,
            "BEGIN": TokenType.COMMAND,
            "BEGIN TRANSACTION": TokenType.BEGIN,
            "BYTEINT": TokenType.INT,
            "BYTES": TokenType.BINARY,
            "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
            "DATETIME": TokenType.TIMESTAMP,
            "DECLARE": TokenType.COMMAND,
            "ELSEIF": TokenType.COMMAND,
            "EXCEPTION": TokenType.COMMAND,
            "FLOAT64": TokenType.DOUBLE,
            "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
            "MODEL": TokenType.MODEL,
            "NOT DETERMINISTIC": TokenType.VOLATILE,
            "RECORD": TokenType.STRUCT,
            "TIMESTAMP": TokenType.TIMESTAMPTZ,
        }
        KEYWORDS.pop("DIV")
        KEYWORDS.pop("VALUES")
        KEYWORDS.pop("/*+")
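    # Tokenizer sketch (illustrative, not part of the upstream source): with the
    # settings above, identifiers are backtick-quoted and strings accept b/r
    # prefixes, e.g.
    #
    #   >>> import sqlglot
    #   >>> sqlglot.parse_one(r"SELECT r'\d+' FROM `project.dataset.tbl`", read="bigquery")
    #
    # tokenizes the raw string via RAW_STRINGS (no escape processing) and hands
    # the backtick-quoted path to _parse_table_parts in the Parser below.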
    class Parser(parser.Parser):
        PREFIXED_PIVOT_COLUMNS = True
        LOG_DEFAULTS_TO_LN = True
        SUPPORTS_IMPLICIT_UNNEST = True

        FUNCTIONS = {
            **parser.Parser.FUNCTIONS,
            "DATE": _build_date,
            "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
            "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
            "DATE_TRUNC": lambda args: exp.DateTrunc(
                unit=exp.Literal.string(str(seq_get(args, 1))),
                this=seq_get(args, 0),
                zone=seq_get(args, 2),
            ),
            "DATETIME": _build_datetime,
            "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd),
            "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub),
            "DIV": binary_from_function(exp.IntDiv),
            "EDIT_DISTANCE": _build_levenshtein,
            "FORMAT_DATE": lambda args: exp.TimeToStr(
                this=exp.TsOrDsToDate(this=seq_get(args, 1)), format=seq_get(args, 0)
            ),
            "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list,
            "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar),
            "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
            "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract),
            "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
            "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar),
            "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray),
            "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True),
            "MD5": exp.MD5Digest.from_arg_list,
            "TO_HEX": _build_to_hex,
            "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")(
                [seq_get(args, 1), seq_get(args, 0)]
            ),
            "PARSE_TIMESTAMP": _build_parse_timestamp,
            "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list,
            "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract),
            "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract),
            "REGEXP_EXTRACT_ALL": _build_regexp_extract(
                exp.RegexpExtractAll, default_group=exp.Literal.number(0)
            ),
            "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
            "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
            "SPLIT": lambda args: exp.Split(
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split
                this=seq_get(args, 0),
                expression=seq_get(args, 1) or exp.Literal.string(","),
            ),
            "TIME": _build_time,
            "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd),
            "TIME_SUB": build_date_delta_with_interval(exp.TimeSub),
            "TIMESTAMP": _build_timestamp,
            "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd),
            "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub),
            "TIMESTAMP_MICROS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MICROS
            ),
            "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
            ),
            "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)),
            "TO_JSON_STRING": exp.JSONFormat.from_arg_list,
            "FORMAT_DATETIME": lambda args: exp.TimeToStr(
                this=exp.TsOrDsToTimestamp(this=seq_get(args, 1)), format=seq_get(args, 0)
            ),
        }

        FUNCTION_PARSERS = {
            **parser.Parser.FUNCTION_PARSERS,
            "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]),
        }
        FUNCTION_PARSERS.pop("TRIM")

        NO_PAREN_FUNCTIONS = {
            **parser.Parser.NO_PAREN_FUNCTIONS,
            TokenType.CURRENT_DATETIME: exp.CurrentDatetime,
        }

        NESTED_TYPE_TOKENS = {
            *parser.Parser.NESTED_TYPE_TOKENS,
            TokenType.TABLE,
        }

        PROPERTY_PARSERS = {
            **parser.Parser.PROPERTY_PARSERS,
            "NOT DETERMINISTIC": lambda self: self.expression(
                exp.StabilityProperty, this=exp.Literal.string("VOLATILE")
            ),
            "OPTIONS": lambda self: self._parse_with_property(),
        }

        CONSTRAINT_PARSERS = {
            **parser.Parser.CONSTRAINT_PARSERS,
            "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()),
        }

        RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy()
        RANGE_PARSERS.pop(TokenType.OVERLAPS)

        NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN}

        STATEMENT_PARSERS = {
            **parser.Parser.STATEMENT_PARSERS,
            TokenType.ELSE: lambda self: self._parse_as_command(self._prev),
            TokenType.END: lambda self: self._parse_as_command(self._prev),
            TokenType.FOR: lambda self: self._parse_for_in(),
        }

        BRACKET_OFFSETS = {
            "OFFSET": (0, False),
            "ORDINAL": (1, False),
            "SAFE_OFFSET": (0, True),
            "SAFE_ORDINAL": (1, True),
        }

        def _parse_for_in(self) -> exp.ForIn:
            this = self._parse_range()
            self._match_text_seq("DO")
            return self.expression(exp.ForIn, this=this, expression=self._parse_statement())

        def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]:
            this = super()._parse_table_part(schema=schema) or self._parse_number()

            # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names
            if isinstance(this, exp.Identifier):
                table_name = this.name
                while self._match(TokenType.DASH, advance=False) and self._next:
                    text = ""
                    while self._is_connected() and self._curr.token_type != TokenType.DOT:
                        self._advance()
                        text += self._prev.text
                    table_name += text

                this = exp.Identifier(this=table_name, quoted=this.args.get("quoted"))
            elif isinstance(this, exp.Literal):
                table_name = this.name

                if self._is_connected() and self._parse_var(any_token=True):
                    table_name += self._prev.text

                this = exp.Identifier(this=table_name, quoted=True)

            return this

        def _parse_table_parts(
            self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False
        ) -> exp.Table:
            table = super()._parse_table_parts(
                schema=schema, is_db_reference=is_db_reference, wildcard=True
            )

            # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
            if not table.catalog:
                if table.db:
                    parts = table.db.split(".")
                    if len(parts) == 2 and not table.args["db"].quoted:
                        table.set("catalog", exp.Identifier(this=parts[0]))
                        table.set("db", exp.Identifier(this=parts[1]))
                else:
                    parts = table.name.split(".")
                    if len(parts) == 2 and not table.this.quoted:
                        table.set("db", exp.Identifier(this=parts[0]))
                        table.set("this", exp.Identifier(this=parts[1]))

            if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts):
                alias = table.this
                catalog, db, this, *rest = (
                    exp.to_identifier(p, quoted=True)
                    for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
                )

                if rest and this:
                    this = exp.Dot.build([this, *rest])  # type: ignore

                table = exp.Table(
                    this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
                )
                table.meta["quoted_table"] = True
            else:
                alias = None

            # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
            # dataset, so if the project identifier is omitted we need to fix the ast so that
            # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier.
            # Otherwise, we wouldn't correctly qualify a `Table` node that references these
            # views, because it would seem like the "catalog" part is set, when it'd actually
            # be the region/dataset. Merging the two identifiers into a single one is done to
            # avoid producing a 4-part Table reference, which would cause issues in the schema
            # module, when there are 3-part table names mixed with information schema views.
            #
            # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
            table_parts = table.parts
            if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA":
                # We need to alias the table here to avoid breaking existing qualified columns.
                # This is expected to be safe, because if there's an actual alias coming up in
                # the token stream, it will overwrite this one. If there isn't one, we are only
                # exposing the name that can be used to reference the view explicitly (a no-op).
                exp.alias_(
                    table,
                    t.cast(exp.Identifier, alias or table_parts[-1]),
                    table=True,
                    copy=False,
                )

                info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}"
                table.set("this", exp.Identifier(this=info_schema_view, quoted=True))
                table.set("db", seq_get(table_parts, -3))
                table.set("catalog", seq_get(table_parts, -4))

            return table

        def _parse_column(self) -> t.Optional[exp.Expression]:
            column = super()._parse_column()
            if isinstance(column, exp.Column):
                parts = column.parts
                if any("." in p.name for p in parts):
                    catalog, db, table, this, *rest = (
                        exp.to_identifier(p, quoted=True)
                        for p in split_num_words(".".join(p.name for p in parts), ".", 4)
                    )

                    if rest and this:
                        this = exp.Dot.build([this, *rest])  # type: ignore

                    column = exp.Column(this=this, table=table, db=db, catalog=catalog)
                    column.meta["quoted_column"] = True

            return column

        @t.overload
        def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

        @t.overload
        def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...

        def _parse_json_object(self, agg=False):
            json_object = super()._parse_json_object()
            array_kv_pair = seq_get(json_object.expressions, 0)

            # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2
            if (
                array_kv_pair
                and isinstance(array_kv_pair.this, exp.Array)
                and isinstance(array_kv_pair.expression, exp.Array)
            ):
                keys = array_kv_pair.this.expressions
                values = array_kv_pair.expression.expressions

                json_object.set(
                    "expressions",
                    [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)],
                )

            return json_object

        def _parse_bracket(
            self, this: t.Optional[exp.Expression] = None
        ) -> t.Optional[exp.Expression]:
            bracket = super()._parse_bracket(this)

            if this is bracket:
                return bracket

            if isinstance(bracket, exp.Bracket):
                for expression in bracket.expressions:
                    name = expression.name.upper()

                    if name not in self.BRACKET_OFFSETS:
                        break

                    offset, safe = self.BRACKET_OFFSETS[name]
                    bracket.set("offset", offset)
                    bracket.set("safe", safe)
                    expression.replace(expression.expressions[0])

            return bracket

        def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]:
            unnest = super()._parse_unnest(with_alias=with_alias)

            if not unnest:
                return None

            unnest_expr = seq_get(unnest.expressions, 0)
            if unnest_expr:
                from sqlglot.optimizer.annotate_types import annotate_types

                unnest_expr = annotate_types(unnest_expr)

                # Unnesting a nested array (i.e. an array of structs) explodes the top-level struct fields,
                # in contrast to other dialects such as DuckDB which flattens only the array by default
                if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any(
                    array_elem.is_type(exp.DataType.Type.STRUCT)
                    for array_elem in unnest_expr._type.expressions
                ):
                    unnest.set("explode_array", True)

            return unnest
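    # Parser sketch (illustrative, not part of the upstream source): BRACKET_OFFSETS
    # above feeds _parse_bracket, which normalizes BigQuery's positional array
    # accessors, e.g.
    #
    #   >>> import sqlglot
    #   >>> sqlglot.parse_one("SELECT arr[SAFE_OFFSET(0)] FROM t", read="bigquery")
    #
    # yields an exp.Bracket with offset=0 and safe=True, which bracket_sql in the
    # Generator below renders back as arr[SAFE_OFFSET(0)].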
    class Generator(generator.Generator):
        INTERVAL_ALLOWS_PLURAL_FORM = False
        JOIN_HINTS = False
        QUERY_HINTS = False
        TABLE_HINTS = False
        LIMIT_FETCH = "LIMIT"
        RENAME_TABLE_WITH_DB = False
        NVL2_SUPPORTED = False
        UNNEST_WITH_ORDINALITY = False
        COLLATE_IS_FUNC = True
        LIMIT_ONLY_LITERALS = True
        SUPPORTS_TABLE_ALIAS_COLUMNS = False
        UNPIVOT_ALIASES_ARE_IDENTIFIERS = False
        JSON_KEY_VALUE_PAIR_SEP = ","
        NULL_ORDERING_SUPPORTED = False
        IGNORE_NULLS_IN_FUNC = True
        JSON_PATH_SINGLE_QUOTE_ESCAPE = True
        CAN_IMPLEMENT_ARRAY_ANY = True
        SUPPORTS_TO_NUMBER = False
        NAMED_PLACEHOLDER_TOKEN = "@"
        HEX_FUNC = "TO_HEX"
        WITH_PROPERTIES_PREFIX = "OPTIONS"
        SUPPORTS_EXPLODING_PROJECTIONS = False
        EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False
        SUPPORTS_UNIX_SECONDS = True

        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
            exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
            exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
            exp.Array: inline_array_unless_query,
            exp.ArrayContains: _array_contains_sql,
            exp.ArrayFilter: filter_array_using_unnest,
            exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
            exp.CollateProperty: lambda self, e: (
                f"DEFAULT COLLATE {self.sql(e, 'this')}"
                if e.args.get("default")
                else f"COLLATE {self.sql(e, 'this')}"
            ),
            exp.Commit: lambda *_: "COMMIT TRANSACTION",
            exp.CountIf: rename_func("COUNTIF"),
            exp.Create: _create_sql,
            exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
            exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
            exp.DateDiff: lambda self, e: self.func(
                "DATE_DIFF", e.this, e.expression, unit_to_var(e)
            ),
            exp.DateFromParts: rename_func("DATE"),
            exp.DateStrToDate: datestrtodate_sql,
            exp.DateSub: date_add_interval_sql("DATE", "SUB"),
            exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
            exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
            exp.DateTrunc: lambda self, e: self.func(
                "DATE_TRUNC", e.this, e.text("unit"), e.args.get("zone")
            ),
            exp.FromTimeZone: lambda self, e: self.func(
                "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
            ),
            exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
            exp.GroupConcat: rename_func("STRING_AGG"),
            exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))),
            exp.If: if_sql(false_value="NULL"),
            exp.ILike: no_ilike_sql,
            exp.IntDiv: rename_func("DIV"),
            exp.JSONFormat: rename_func("TO_JSON_STRING"),
            exp.Levenshtein: _levenshtein_sql,
            exp.Max: max_or_greatest,
            exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
            exp.MD5Digest: rename_func("MD5"),
            exp.Min: min_or_least,
            exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
            exp.RegexpExtract: lambda self, e: self.func(
                "REGEXP_EXTRACT",
                e.this,
                e.expression,
                e.args.get("position"),
                e.args.get("occurrence"),
            ),
            exp.RegexpExtractAll: lambda self, e: self.func(
                "REGEXP_EXTRACT_ALL", e.this, e.expression
            ),
            exp.RegexpReplace: regexp_replace_sql,
            exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
            exp.ReturnsProperty: _returnsproperty_sql,
            exp.Rollback: lambda *_: "ROLLBACK TRANSACTION",
            exp.Select: transforms.preprocess(
                [
                    transforms.explode_to_unnest(),
                    transforms.unqualify_unnest,
                    transforms.eliminate_distinct_on,
                    _alias_ordered_group,
                    transforms.eliminate_semi_and_anti_joins,
                ]
            ),
            exp.SHA: rename_func("SHA1"),
            exp.SHA2: sha256_sql,
            exp.StabilityProperty: lambda self, e: (
                "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC"
            ),
            exp.String: rename_func("STRING"),
            exp.StrToDate: _str_to_datetime_sql,
            exp.StrToTime: _str_to_datetime_sql,
            exp.TimeAdd: date_add_interval_sql("TIME", "ADD"),
            exp.TimeFromParts: rename_func("TIME"),
            exp.TimestampFromParts: rename_func("DATETIME"),
            exp.TimeSub: date_add_interval_sql("TIME", "SUB"),
            exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"),
            exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"),
            exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"),
            exp.TimeStrToTime: timestrtotime_sql,
            exp.Transaction: lambda *_: "BEGIN TRANSACTION",
            exp.TsOrDsAdd: _ts_or_ds_add_sql,
            exp.TsOrDsDiff: _ts_or_ds_diff_sql,
            exp.TsOrDsToTime: rename_func("TIME"),
            exp.TsOrDsToTimestamp: rename_func("DATETIME"),
            exp.Unhex: rename_func("FROM_HEX"),
            exp.UnixDate: rename_func("UNIX_DATE"),
            exp.UnixToTime: _unix_to_time_sql,
            exp.Uuid: lambda *_: "GENERATE_UUID()",
            exp.Values: _derived_table_values_to_unnest,
            exp.VariancePop: rename_func("VAR_POP"),
        }
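        # Illustrative examples of the TRANSFORMS above (not part of the upstream
        # source; exact output can vary across sqlglot versions):
        #
        #   >>> import sqlglot
        #   >>> sqlglot.transpile("SELECT MD5('abc')", write="bigquery")[0]
        #   "SELECT TO_HEX(MD5('abc'))"
        #   >>> sqlglot.transpile("SELECT COUNT_IF(x > 0) FROM t", write="bigquery")[0]
        #   'SELECT COUNTIF(x > 0) FROM t'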
        SUPPORTED_JSON_PATH_PARTS = {
            exp.JSONPathKey,
            exp.JSONPathRoot,
            exp.JSONPathSubscript,
        }

        TYPE_MAPPING = {
            **generator.Generator.TYPE_MAPPING,
            exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC",
            exp.DataType.Type.BIGINT: "INT64",
            exp.DataType.Type.BINARY: "BYTES",
            exp.DataType.Type.BOOLEAN: "BOOL",
            exp.DataType.Type.CHAR: "STRING",
            exp.DataType.Type.DECIMAL: "NUMERIC",
            exp.DataType.Type.DOUBLE: "FLOAT64",
            exp.DataType.Type.FLOAT: "FLOAT64",
            exp.DataType.Type.INT: "INT64",
            exp.DataType.Type.NCHAR: "STRING",
            exp.DataType.Type.NVARCHAR: "STRING",
            exp.DataType.Type.SMALLINT: "INT64",
            exp.DataType.Type.TEXT: "STRING",
            exp.DataType.Type.TIMESTAMP: "DATETIME",
            exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP",
            exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP",
            exp.DataType.Type.TINYINT: "INT64",
            exp.DataType.Type.ROWVERSION: "BYTES",
            exp.DataType.Type.UUID: "STRING",
            exp.DataType.Type.VARBINARY: "BYTES",
            exp.DataType.Type.VARCHAR: "STRING",
            exp.DataType.Type.VARIANT: "ANY TYPE",
        }

        PROPERTIES_LOCATION = {
            **generator.Generator.PROPERTIES_LOCATION,
            exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
            exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
        }

        # WINDOW comes after QUALIFY
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause
        AFTER_HAVING_MODIFIER_TRANSFORMS = {
            "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"],
            "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"],
        }

        # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
        RESERVED_KEYWORDS = {
            "all",
            "and",
            "any",
            "array",
            "as",
            "asc",
            "assert_rows_modified",
            "at",
            "between",
            "by",
            "case",
            "cast",
            "collate",
            "contains",
            "create",
            "cross",
            "cube",
            "current",
            "default",
            "define",
            "desc",
            "distinct",
            "else",
            "end",
            "enum",
            "escape",
            "except",
            "exclude",
            "exists",
            "extract",
            "false",
            "fetch",
            "following",
            "for",
            "from",
            "full",
            "group",
            "grouping",
            "groups",
            "hash",
            "having",
            "if",
            "ignore",
            "in",
            "inner",
            "intersect",
            "interval",
            "into",
            "is",
            "join",
            "lateral",
            "left",
            "like",
            "limit",
            "lookup",
            "merge",
            "natural",
            "new",
            "no",
            "not",
            "null",
            "nulls",
            "of",
            "on",
            "or",
            "order",
            "outer",
            "over",
            "partition",
            "preceding",
            "proto",
            "qualify",
            "range",
            "recursive",
            "respect",
            "right",
            "rollup",
            "rows",
            "select",
            "set",
            "some",
            "struct",
            "tablesample",
            "then",
            "to",
            "treat",
            "true",
            "unbounded",
            "union",
            "unnest",
            "using",
            "when",
            "where",
            "window",
            "with",
            "within",
        }

        def mod_sql(self, expression: exp.Mod) -> str:
            this = expression.this
            expr = expression.expression
            return self.func(
                "MOD",
                this.unnest() if isinstance(this, exp.Paren) else this,
                expr.unnest() if isinstance(expr, exp.Paren) else expr,
            )

        def column_parts(self, expression: exp.Column) -> str:
            if expression.meta.get("quoted_column"):
                # If a column reference is of the form `dataset.table`.name, we need
                # to preserve the quoted table path, otherwise the reference breaks
                table_parts = ".".join(p.name for p in expression.parts[:-1])
                table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
                return f"{table_path}.{self.sql(expression, 'this')}"

            return super().column_parts(expression)

        def table_parts(self, expression: exp.Table) -> str:
            # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
            # we need to make sure the correct quoting is used in each case.
            #
            # For example, if there is a CTE x that clashes with a schema name, then the former will
            # return the table y in that schema, whereas the latter will return the CTE's y column:
            #
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
            if expression.meta.get("quoted_table"):
                table_parts = ".".join(p.name for p in expression.parts)
                return self.sql(exp.Identifier(this=table_parts, quoted=True))

            return super().table_parts(expression)

        def timetostr_sql(self, expression: exp.TimeToStr) -> str:
            if isinstance(expression.this, exp.TsOrDsToTimestamp):
                func_name = "FORMAT_DATETIME"
            else:
                func_name = "FORMAT_DATE"

            this = (
                expression.this
                if isinstance(expression.this, (exp.TsOrDsToTimestamp, exp.TsOrDsToDate))
                else expression
            )
            return self.func(func_name, self.format_time(expression), this.this)

        def eq_sql(self, expression: exp.EQ) -> str:
            # Operands of = cannot be NULL in BigQuery
            if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
                if not isinstance(expression.parent, exp.Update):
                    return "NULL"

            return self.binary(expression, "=")

        def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
            parent = expression.parent

            # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
            # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
            if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
                return self.func(
                    "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
                )

            return super().attimezone_sql(expression)

        def trycast_sql(self, expression: exp.TryCast) -> str:
            return self.cast_sql(expression, safe_prefix="SAFE_")

        def bracket_sql(self, expression: exp.Bracket) -> str:
            this = expression.this
            expressions = expression.expressions

            if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
                arg = expressions[0]
                if arg.type is None:
                    from sqlglot.optimizer.annotate_types import annotate_types

                    arg = annotate_types(arg)

                if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
                    # BQ doesn't support bracket syntax with string values for structs
                    return f"{self.sql(this)}.{arg.name}"

            expressions_sql = self.expressions(expression, flat=True)
            offset = expression.args.get("offset")

            if offset == 0:
                expressions_sql = f"OFFSET({expressions_sql})"
            elif offset == 1:
                expressions_sql = f"ORDINAL({expressions_sql})"
            elif offset is not None:
                self.unsupported(f"Unsupported array offset: {offset}")

            if expression.args.get("safe"):
                expressions_sql = f"SAFE_{expressions_sql}"

            return f"{self.sql(this)}[{expressions_sql}]"

        def in_unnest_op(self, expression: exp.Unnest) -> str:
            return self.sql(expression)

        def version_sql(self, expression: exp.Version) -> str:
            if expression.name == "TIMESTAMP":
                expression.set("this", "SYSTEM_TIME")
            return super().version_sql(expression)
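As a closing sketch (illustrative, not part of the module): several of the Generator overrides above are easy to observe through the public API, e.g. trycast_sql adds the SAFE_ prefix and TYPE_MAPPING renames INT to INT64:

    import sqlglot

    print(sqlglot.transpile("SELECT TRY_CAST(x AS INT) FROM t", write="bigquery")[0])
    # Expected shape (may vary by version): SELECT SAFE_CAST(x AS INT64) FROM t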
1076 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1077 return self.func( 1078 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1079 ) 1080 1081 return super().attimezone_sql(expression) 1082 1083 def trycast_sql(self, expression: exp.TryCast) -> str: 1084 return self.cast_sql(expression, safe_prefix="SAFE_") 1085 1086 def bracket_sql(self, expression: exp.Bracket) -> str: 1087 this = expression.this 1088 expressions = expression.expressions 1089 1090 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1091 arg = expressions[0] 1092 if arg.type is None: 1093 from sqlglot.optimizer.annotate_types import annotate_types 1094 1095 arg = annotate_types(arg) 1096 1097 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1098 # BQ doesn't support bracket syntax with string values for structs 1099 return f"{self.sql(this)}.{arg.name}" 1100 1101 expressions_sql = self.expressions(expression, flat=True) 1102 offset = expression.args.get("offset") 1103 1104 if offset == 0: 1105 expressions_sql = f"OFFSET({expressions_sql})" 1106 elif offset == 1: 1107 expressions_sql = f"ORDINAL({expressions_sql})" 1108 elif offset is not None: 1109 self.unsupported(f"Unsupported array offset: {offset}") 1110 1111 if expression.args.get("safe"): 1112 expressions_sql = f"SAFE_{expressions_sql}" 1113 1114 return f"{self.sql(this)}[{expressions_sql}]" 1115 1116 def in_unnest_op(self, expression: exp.Unnest) -> str: 1117 return self.sql(expression) 1118 1119 def version_sql(self, expression: exp.Version) -> str: 1120 if expression.name == "TIMESTAMP": 1121 expression.set("this", "SYSTEM_TIME") 1122 return super().version_sql(expression)
WEEK_OFFSET
First day of the week in DATE_TRUNC(week). Defaults to 0 (Monday). -1 would be Sunday.
LOG_BASE_FIRST
Whether the base comes first in the LOG function. Possible values: True, False, None (two arguments are not supported by LOG).
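A minimal sketch of what this setting controls (the output in the comment is indicative and may vary across sqlglot versions): BigQuery's LOG takes the base as its second argument, while e.g. Postgres takes it first, so transpiling swaps the arguments.

import sqlglot

# BigQuery: LOG(x, base); Postgres: LOG(base, x).
# Expected shape: SELECT LOG(10, 100)
print(sqlglot.transpile("SELECT LOG(100, 10)", read="bigquery", write="postgres")[0])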
FORCE_EARLY_ALIAS_REF_EXPANSION
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).

For example:

WITH data AS (
  SELECT 1 AS id, 2 AS my_id
)
SELECT
  id AS my_id
FROM data
WHERE my_id = 1
GROUP BY my_id
HAVING my_id = 1

In most dialects, "my_id" would refer to "data.my_id" across the query, except:
- BigQuery, which will forward the alias to the GROUP BY and HAVING clauses, i.e. it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1"
- ClickHouse, which will forward the alias across the whole query, i.e. it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
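This behavior can be sketched with the optimizer's qualify rule; the exact qualified output may differ between versions, but under the BigQuery dialect the GROUP BY/HAVING references are expected to resolve to the underlying id column:

import sqlglot
from sqlglot.optimizer.qualify import qualify

sql = """
WITH data AS (SELECT 1 AS id, 2 AS my_id)
SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1
"""
expression = sqlglot.parse_one(sql, read="bigquery")

# WHERE should keep referencing data.my_id, while GROUP BY/HAVING are
# forwarded to data.id (indicative output, per the note above)
print(qualify(expression, dialect="bigquery").sql(dialect="bigquery"))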
EXPAND_ALIAS_REFS_EARLY_ONLY_IN_GROUP_BY
Whether alias reference expansion before qualification should only happen for the GROUP BY clause.
NORMALIZATION_STRATEGY
Specifies the strategy according to which identifiers should be normalized.
NORMALIZE_FUNCTIONS
Determines how function names are going to be normalized.

Possible values:
- "upper" or True: Convert names to uppercase.
- "lower": Convert names to lowercase.
- False: Disables function name normalization.
TIME_MAPPING
Associates this dialect's time formats with their equivalent Python strftime formats.
FORMAT_MAPPING
Helper which is used for parsing the special syntax CAST(x AS DATE FORMAT 'yyyy'). If empty, the corresponding trie will be constructed off of TIME_MAPPING.
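A hedged sketch of TIME_MAPPING in action: BigQuery's %-style format tokens are bridged through Python strftime equivalents into the target dialect's notation. The exact output depends on the sqlglot version, but transpiling to Hive should yield its 'yyyy-MM-dd'-style tokens.

import sqlglot

# Indicative output: a DATE_FORMAT(...)-style expression using 'yyyy-MM-dd'
print(sqlglot.transpile("SELECT FORMAT_DATE('%Y-%m-%d', d) FROM t", read="bigquery", write="hive")[0])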
PSEUDOCOLUMNS
Columns that are auto-generated by the engine corresponding to this dialect. For example, such columns may be excluded from SELECT * queries.
SET_OP_DISTINCT_BY_DEFAULT
Whether a set operation uses DISTINCT by default. This is None when either DISTINCT or ALL must be explicitly specified.
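For BigQuery this means a bare UNION cannot be emitted, since the engine requires UNION {ALL|DISTINCT}; a small sketch (indicative output in the comment):

import sqlglot

# Expected shape: SELECT 1 UNION DISTINCT SELECT 2
print(sqlglot.transpile("SELECT 1 UNION SELECT 2", read="postgres", write="bigquery")[0])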
def normalize_identifier(self, expression: E) -> E:
    if (
        isinstance(expression, exp.Identifier)
        and self.normalization_strategy is not NormalizationStrategy.CASE_SENSITIVE
    ):
        parent = expression.parent
        while isinstance(parent, exp.Dot):
            parent = parent.parent

        # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
        # by default. The following check uses a heuristic to detect tables based on whether
        # they are qualified. This should generally be correct, because tables in BigQuery
        # must be qualified with at least a dataset, unless @@dataset_id is set.
        case_sensitive = (
            isinstance(parent, exp.UserDefinedFunction)
            or (
                isinstance(parent, exp.Table)
                and parent.db
                and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
            )
            or expression.meta.get("is_table")
        )
        if not case_sensitive:
            expression.set("this", expression.this.lower())

    return expression
Transforms an identifier in a way that resembles how it'd be resolved by this dialect.

For example, an identifier like FoO would be resolved as foo in Postgres, because it lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so it would resolve it as FOO. If it was quoted, it'd need to be treated as case-sensitive, and so any normalization would be prohibited in order to avoid "breaking" the identifier.

There are also dialects like Spark, which are case-insensitive even when quotes are present, and dialects like MySQL, whose resolution rules match those employed by the underlying operating system; for example, they may always be case-sensitive on Linux.

Finally, the normalization behavior of some engines can even be controlled through flags, like in Redshift's case, where users can explicitly set enable_case_sensitive_identifier.

SQLGlot aims to understand and handle all of these different behaviors gracefully, so that it can analyze queries in the optimizer and successfully capture their semantics.
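A minimal usage sketch, assuming a recent sqlglot where Dialect.get_or_raise returns a dialect instance:

from sqlglot import exp
from sqlglot.dialects.dialect import Dialect

bigquery = Dialect.get_or_raise("bigquery")

# Unquoted identifiers are case-insensitive in BigQuery, so FoO normalizes to foo
print(bigquery.normalize_identifier(exp.to_identifier("FoO")).sql(dialect="bigquery"))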
ESCAPED_SEQUENCES
Mapping of an escaped sequence (e.g. the two-character string "\n") to its unescaped version (here, a literal newline character).
class Tokenizer(tokens.Tokenizer):
    QUOTES = ["'", '"', '"""', "'''"]
    COMMENTS = ["--", "#", ("/*", "*/")]
    IDENTIFIERS = ["`"]
    STRING_ESCAPES = ["\\"]

    HEX_STRINGS = [("0x", ""), ("0X", "")]

    BYTE_STRINGS = [
        (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
    ]

    RAW_STRINGS = [
        (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
    ]

    KEYWORDS = {
        **tokens.Tokenizer.KEYWORDS,
        "ANY TYPE": TokenType.VARIANT,
        "BEGIN": TokenType.COMMAND,
        "BEGIN TRANSACTION": TokenType.BEGIN,
        "BYTEINT": TokenType.INT,
        "BYTES": TokenType.BINARY,
        "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
        "DATETIME": TokenType.TIMESTAMP,
        "DECLARE": TokenType.COMMAND,
        "ELSEIF": TokenType.COMMAND,
        "EXCEPTION": TokenType.COMMAND,
        "FLOAT64": TokenType.DOUBLE,
        "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
        "MODEL": TokenType.MODEL,
        "NOT DETERMINISTIC": TokenType.VOLATILE,
        "RECORD": TokenType.STRUCT,
        "TIMESTAMP": TokenType.TIMESTAMPTZ,
    }
    KEYWORDS.pop("DIV")
    KEYWORDS.pop("VALUES")
    KEYWORDS.pop("/*+")
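A small sketch of these settings at work; the token attributes used below follow sqlglot's Token class:

import sqlglot

sql = r"SELECT r'\d+', b'abc' FROM `my-project.dataset.tbl`"
for token in sqlglot.tokenize(sql, read="bigquery"):
    # raw/byte strings and the backtick-quoted identifier come back as single tokens
    print(token.token_type, repr(token.text))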
class Parser(parser.Parser):
    PREFIXED_PIVOT_COLUMNS = True
    LOG_DEFAULTS_TO_LN = True
    SUPPORTS_IMPLICIT_UNNEST = True

    FUNCTIONS = {
        **parser.Parser.FUNCTIONS,
        "DATE": _build_date,
        "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
        "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
        "DATE_TRUNC": lambda args: exp.DateTrunc(
            unit=exp.Literal.string(str(seq_get(args, 1))),
            this=seq_get(args, 0),
            zone=seq_get(args, 2),
        ),
        "DATETIME": _build_datetime,
        "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd),
        "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub),
        "DIV": binary_from_function(exp.IntDiv),
        "EDIT_DISTANCE": _build_levenshtein,
        "FORMAT_DATE": lambda args: exp.TimeToStr(
            this=exp.TsOrDsToDate(this=seq_get(args, 1)), format=seq_get(args, 0)
        ),
        "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list,
        "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar),
        "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
        "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract),
        "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
        "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar),
        "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray),
        "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True),
        "MD5": exp.MD5Digest.from_arg_list,
        "TO_HEX": _build_to_hex,
        "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")(
            [seq_get(args, 1), seq_get(args, 0)]
        ),
        "PARSE_TIMESTAMP": _build_parse_timestamp,
        "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list,
        "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract),
        "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract),
        "REGEXP_EXTRACT_ALL": _build_regexp_extract(
            exp.RegexpExtractAll, default_group=exp.Literal.number(0)
        ),
        "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
        "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
        "SPLIT": lambda args: exp.Split(
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split
            this=seq_get(args, 0),
            expression=seq_get(args, 1) or exp.Literal.string(","),
        ),
        "TIME": _build_time,
        "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd),
        "TIME_SUB": build_date_delta_with_interval(exp.TimeSub),
        "TIMESTAMP": _build_timestamp,
        "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd),
        "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub),
        "TIMESTAMP_MICROS": lambda args: exp.UnixToTime(
            this=seq_get(args, 0), scale=exp.UnixToTime.MICROS
        ),
        "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime(
            this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
        ),
        "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)),
        "TO_JSON_STRING": exp.JSONFormat.from_arg_list,
        "FORMAT_DATETIME": lambda args: exp.TimeToStr(
            this=exp.TsOrDsToTimestamp(this=seq_get(args, 1)), format=seq_get(args, 0)
        ),
    }

    FUNCTION_PARSERS = {
        **parser.Parser.FUNCTION_PARSERS,
        "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]),
    }
    FUNCTION_PARSERS.pop("TRIM")

    NO_PAREN_FUNCTIONS = {
        **parser.Parser.NO_PAREN_FUNCTIONS,
        TokenType.CURRENT_DATETIME: exp.CurrentDatetime,
    }

    NESTED_TYPE_TOKENS = {
        *parser.Parser.NESTED_TYPE_TOKENS,
        TokenType.TABLE,
    }

    PROPERTY_PARSERS = {
        **parser.Parser.PROPERTY_PARSERS,
        "NOT DETERMINISTIC": lambda self: self.expression(
            exp.StabilityProperty, this=exp.Literal.string("VOLATILE")
        ),
        "OPTIONS": lambda self: self._parse_with_property(),
    }

    CONSTRAINT_PARSERS = {
        **parser.Parser.CONSTRAINT_PARSERS,
        "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()),
    }

    RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy()
    RANGE_PARSERS.pop(TokenType.OVERLAPS)

    NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN}

    STATEMENT_PARSERS = {
        **parser.Parser.STATEMENT_PARSERS,
        TokenType.ELSE: lambda self: self._parse_as_command(self._prev),
        TokenType.END: lambda self: self._parse_as_command(self._prev),
        TokenType.FOR: lambda self: self._parse_for_in(),
    }

    BRACKET_OFFSETS = {
        "OFFSET": (0, False),
        "ORDINAL": (1, False),
        "SAFE_OFFSET": (0, True),
        "SAFE_ORDINAL": (1, True),
    }

    def _parse_for_in(self) -> exp.ForIn:
        this = self._parse_range()
        self._match_text_seq("DO")
        return self.expression(exp.ForIn, this=this, expression=self._parse_statement())

    def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]:
        this = super()._parse_table_part(schema=schema) or self._parse_number()

        # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names
        if isinstance(this, exp.Identifier):
            table_name = this.name
            while self._match(TokenType.DASH, advance=False) and self._next:
                text = ""
                while self._is_connected() and self._curr.token_type != TokenType.DOT:
                    self._advance()
                    text += self._prev.text
                table_name += text

            this = exp.Identifier(this=table_name, quoted=this.args.get("quoted"))
        elif isinstance(this, exp.Literal):
            table_name = this.name

            if self._is_connected() and self._parse_var(any_token=True):
                table_name += self._prev.text

            this = exp.Identifier(this=table_name, quoted=True)

        return this

    def _parse_table_parts(
        self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False
    ) -> exp.Table:
        table = super()._parse_table_parts(
            schema=schema, is_db_reference=is_db_reference, wildcard=True
        )

        # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
        if not table.catalog:
            if table.db:
                parts = table.db.split(".")
                if len(parts) == 2 and not table.args["db"].quoted:
                    table.set("catalog", exp.Identifier(this=parts[0]))
                    table.set("db", exp.Identifier(this=parts[1]))
            else:
                parts = table.name.split(".")
                if len(parts) == 2 and not table.this.quoted:
                    table.set("db", exp.Identifier(this=parts[0]))
                    table.set("this", exp.Identifier(this=parts[1]))

        if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts):
            alias = table.this
            catalog, db, this, *rest = (
                exp.to_identifier(p, quoted=True)
                for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
            )

            if rest and this:
                this = exp.Dot.build([this, *rest])  # type: ignore

            table = exp.Table(
                this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
            )
            table.meta["quoted_table"] = True
        else:
            alias = None

        # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
        # dataset, so if the project identifier is omitted we need to fix the ast so that
        # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier.
        # Otherwise, we wouldn't correctly qualify a `Table` node that references these
        # views, because it would seem like the "catalog" part is set, when it'd actually
        # be the region/dataset. Merging the two identifiers into a single one is done to
        # avoid producing a 4-part Table reference, which would cause issues in the schema
        # module, when there are 3-part table names mixed with information schema views.
        #
        # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
        table_parts = table.parts
        if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA":
            # We need to alias the table here to avoid breaking existing qualified columns.
            # This is expected to be safe, because if there's an actual alias coming up in
            # the token stream, it will overwrite this one. If there isn't one, we are only
            # exposing the name that can be used to reference the view explicitly (a no-op).
            exp.alias_(
                table,
                t.cast(exp.Identifier, alias or table_parts[-1]),
                table=True,
                copy=False,
            )

            info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}"
            table.set("this", exp.Identifier(this=info_schema_view, quoted=True))
            table.set("db", seq_get(table_parts, -3))
            table.set("catalog", seq_get(table_parts, -4))

        return table

    def _parse_column(self) -> t.Optional[exp.Expression]:
        column = super()._parse_column()
        if isinstance(column, exp.Column):
            parts = column.parts
            if any("." in p.name for p in parts):
                catalog, db, table, this, *rest = (
                    exp.to_identifier(p, quoted=True)
                    for p in split_num_words(".".join(p.name for p in parts), ".", 4)
                )

                if rest and this:
                    this = exp.Dot.build([this, *rest])  # type: ignore

                column = exp.Column(this=this, table=table, db=db, catalog=catalog)
                column.meta["quoted_column"] = True

        return column

    @t.overload
    def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

    @t.overload
    def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...

    def _parse_json_object(self, agg=False):
        json_object = super()._parse_json_object()
        array_kv_pair = seq_get(json_object.expressions, 0)

        # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2
        if (
            array_kv_pair
            and isinstance(array_kv_pair.this, exp.Array)
            and isinstance(array_kv_pair.expression, exp.Array)
        ):
            keys = array_kv_pair.this.expressions
            values = array_kv_pair.expression.expressions

            json_object.set(
                "expressions",
                [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)],
            )

        return json_object

    def _parse_bracket(
        self, this: t.Optional[exp.Expression] = None
    ) -> t.Optional[exp.Expression]:
        bracket = super()._parse_bracket(this)

        if this is bracket:
            return bracket

        if isinstance(bracket, exp.Bracket):
            for expression in bracket.expressions:
                name = expression.name.upper()

                if name not in self.BRACKET_OFFSETS:
                    break

                offset, safe = self.BRACKET_OFFSETS[name]
                bracket.set("offset", offset)
                bracket.set("safe", safe)
                expression.replace(expression.expressions[0])

        return bracket

    def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]:
        unnest = super()._parse_unnest(with_alias=with_alias)

        if not unnest:
            return None

        unnest_expr = seq_get(unnest.expressions, 0)
        if unnest_expr:
            from sqlglot.optimizer.annotate_types import annotate_types

            unnest_expr = annotate_types(unnest_expr)

            # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields,
            # in contrast to other dialects such as DuckDB which flattens only the array by default
            if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any(
                array_elem.is_type(exp.DataType.Type.STRUCT)
                for array_elem in unnest_expr._type.expressions
            ):
                unnest.set("explode_array", True)

        return unnest
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
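A hedged parsing sketch covering two BigQuery-specific behaviors handled above, un-quoted dashed table names and the OFFSET/ORDINAL/SAFE_* bracket accessors:

import sqlglot
from sqlglot import exp

e = sqlglot.parse_one("SELECT arr[SAFE_OFFSET(1)] FROM my-project.dataset.tbl", read="bigquery")

# The Bracket node carries offset/safe args instead of a SAFE_OFFSET function call
print(repr(e.find(exp.Bracket)))
print(e.sql(dialect="bigquery"))  # indicative round-trip of the original query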
class Generator(generator.Generator):
    INTERVAL_ALLOWS_PLURAL_FORM = False
    JOIN_HINTS = False
    QUERY_HINTS = False
    TABLE_HINTS = False
    LIMIT_FETCH = "LIMIT"
    RENAME_TABLE_WITH_DB = False
    NVL2_SUPPORTED = False
    UNNEST_WITH_ORDINALITY = False
    COLLATE_IS_FUNC = True
    LIMIT_ONLY_LITERALS = True
    SUPPORTS_TABLE_ALIAS_COLUMNS = False
    UNPIVOT_ALIASES_ARE_IDENTIFIERS = False
    JSON_KEY_VALUE_PAIR_SEP = ","
    NULL_ORDERING_SUPPORTED = False
    IGNORE_NULLS_IN_FUNC = True
    JSON_PATH_SINGLE_QUOTE_ESCAPE = True
    CAN_IMPLEMENT_ARRAY_ANY = True
    SUPPORTS_TO_NUMBER = False
    NAMED_PLACEHOLDER_TOKEN = "@"
    HEX_FUNC = "TO_HEX"
    WITH_PROPERTIES_PREFIX = "OPTIONS"
    SUPPORTS_EXPLODING_PROJECTIONS = False
    EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False
    SUPPORTS_UNIX_SECONDS = True

    TRANSFORMS = {
        **generator.Generator.TRANSFORMS,
        exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
        exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
        exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
        exp.Array: inline_array_unless_query,
        exp.ArrayContains: _array_contains_sql,
        exp.ArrayFilter: filter_array_using_unnest,
        exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
        exp.CollateProperty: lambda self, e: (
            f"DEFAULT COLLATE {self.sql(e, 'this')}"
            if e.args.get("default")
            else f"COLLATE {self.sql(e, 'this')}"
        ),
        exp.Commit: lambda *_: "COMMIT TRANSACTION",
        exp.CountIf: rename_func("COUNTIF"),
        exp.Create: _create_sql,
        exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
        exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
        exp.DateDiff: lambda self, e: self.func(
            "DATE_DIFF", e.this, e.expression, unit_to_var(e)
        ),
        exp.DateFromParts: rename_func("DATE"),
        exp.DateStrToDate: datestrtodate_sql,
        exp.DateSub: date_add_interval_sql("DATE", "SUB"),
        exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
        exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
        exp.DateTrunc: lambda self, e: self.func(
            "DATE_TRUNC", e.this, e.text("unit"), e.args.get("zone")
        ),
        exp.FromTimeZone: lambda self, e: self.func(
            "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
        ),
        exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
        exp.GroupConcat: rename_func("STRING_AGG"),
        exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))),
        exp.If: if_sql(false_value="NULL"),
        exp.ILike: no_ilike_sql,
        exp.IntDiv: rename_func("DIV"),
        exp.JSONFormat: rename_func("TO_JSON_STRING"),
        exp.Levenshtein: _levenshtein_sql,
        exp.Max: max_or_greatest,
        exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
        exp.MD5Digest: rename_func("MD5"),
        exp.Min: min_or_least,
        exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
        exp.RegexpExtract: lambda self, e: self.func(
            "REGEXP_EXTRACT",
            e.this,
            e.expression,
            e.args.get("position"),
            e.args.get("occurrence"),
        ),
        exp.RegexpExtractAll: lambda self, e: self.func(
            "REGEXP_EXTRACT_ALL", e.this, e.expression
        ),
        exp.RegexpReplace: regexp_replace_sql,
        exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
        exp.ReturnsProperty: _returnsproperty_sql,
        exp.Rollback: lambda *_: "ROLLBACK TRANSACTION",
        exp.Select: transforms.preprocess(
            [
                transforms.explode_to_unnest(),
                transforms.unqualify_unnest,
                transforms.eliminate_distinct_on,
                _alias_ordered_group,
                transforms.eliminate_semi_and_anti_joins,
            ]
        ),
        exp.SHA: rename_func("SHA1"),
        exp.SHA2: sha256_sql,
        exp.StabilityProperty: lambda self, e: (
            "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC"
        ),
        exp.String: rename_func("STRING"),
        exp.StrToDate: _str_to_datetime_sql,
        exp.StrToTime: _str_to_datetime_sql,
        exp.TimeAdd: date_add_interval_sql("TIME", "ADD"),
        exp.TimeFromParts: rename_func("TIME"),
        exp.TimestampFromParts: rename_func("DATETIME"),
        exp.TimeSub: date_add_interval_sql("TIME", "SUB"),
        exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"),
        exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"),
        exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"),
        exp.TimeStrToTime: timestrtotime_sql,
        exp.Transaction: lambda *_: "BEGIN TRANSACTION",
        exp.TsOrDsAdd: _ts_or_ds_add_sql,
        exp.TsOrDsDiff: _ts_or_ds_diff_sql,
        exp.TsOrDsToTime: rename_func("TIME"),
        exp.TsOrDsToTimestamp: rename_func("DATETIME"),
        exp.Unhex: rename_func("FROM_HEX"),
        exp.UnixDate: rename_func("UNIX_DATE"),
        exp.UnixToTime: _unix_to_time_sql,
        exp.Uuid: lambda *_: "GENERATE_UUID()",
        exp.Values: _derived_table_values_to_unnest,
        exp.VariancePop: rename_func("VAR_POP"),
    }

    SUPPORTED_JSON_PATH_PARTS = {
        exp.JSONPathKey,
        exp.JSONPathRoot,
        exp.JSONPathSubscript,
    }

    TYPE_MAPPING = {
        **generator.Generator.TYPE_MAPPING,
        exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC",
        exp.DataType.Type.BIGINT: "INT64",
        exp.DataType.Type.BINARY: "BYTES",
        exp.DataType.Type.BOOLEAN: "BOOL",
        exp.DataType.Type.CHAR: "STRING",
        exp.DataType.Type.DECIMAL: "NUMERIC",
        exp.DataType.Type.DOUBLE: "FLOAT64",
        exp.DataType.Type.FLOAT: "FLOAT64",
        exp.DataType.Type.INT: "INT64",
        exp.DataType.Type.NCHAR: "STRING",
        exp.DataType.Type.NVARCHAR: "STRING",
        exp.DataType.Type.SMALLINT: "INT64",
        exp.DataType.Type.TEXT: "STRING",
        exp.DataType.Type.TIMESTAMP: "DATETIME",
        exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP",
        exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP",
        exp.DataType.Type.TINYINT: "INT64",
        exp.DataType.Type.ROWVERSION: "BYTES",
        exp.DataType.Type.UUID: "STRING",
        exp.DataType.Type.VARBINARY: "BYTES",
        exp.DataType.Type.VARCHAR: "STRING",
        exp.DataType.Type.VARIANT: "ANY TYPE",
    }

    PROPERTIES_LOCATION = {
        **generator.Generator.PROPERTIES_LOCATION,
        exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
        exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
    }

    # WINDOW comes after QUALIFY
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause
    AFTER_HAVING_MODIFIER_TRANSFORMS = {
        "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"],
        "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"],
    }

    # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
    RESERVED_KEYWORDS = {
        "all", "and", "any", "array", "as", "asc", "assert_rows_modified", "at",
        "between", "by", "case", "cast", "collate", "contains", "create", "cross",
        "cube", "current", "default", "define", "desc", "distinct", "else", "end",
        "enum", "escape", "except", "exclude", "exists", "extract", "false", "fetch",
        "following", "for", "from", "full", "group", "grouping", "groups", "hash",
        "having", "if", "ignore", "in", "inner", "intersect", "interval", "into",
        "is", "join", "lateral", "left", "like", "limit", "lookup", "merge",
        "natural", "new", "no", "not", "null", "nulls", "of", "on", "or", "order",
        "outer", "over", "partition", "preceding", "proto", "qualify", "range",
        "recursive", "respect", "right", "rollup", "rows", "select", "set", "some",
        "struct", "tablesample", "then", "to", "treat", "true", "unbounded",
        "union", "unnest", "using", "when", "where", "window", "with", "within",
    }

    def mod_sql(self, expression: exp.Mod) -> str:
        this = expression.this
        expr = expression.expression
        return self.func(
            "MOD",
            this.unnest() if isinstance(this, exp.Paren) else this,
            expr.unnest() if isinstance(expr, exp.Paren) else expr,
        )

    def column_parts(self, expression: exp.Column) -> str:
        if expression.meta.get("quoted_column"):
            # If a column reference is of the form `dataset.table`.name, we need
            # to preserve the quoted table path, otherwise the reference breaks
            table_parts = ".".join(p.name for p in expression.parts[:-1])
            table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
            return f"{table_path}.{self.sql(expression, 'this')}"

        return super().column_parts(expression)

    def table_parts(self, expression: exp.Table) -> str:
        # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
        # we need to make sure the correct quoting is used in each case.
        #
        # For example, if there is a CTE x that clashes with a schema name, then the former will
        # return the table y in that schema, whereas the latter will return the CTE's y column:
        #
        # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
        # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
        if expression.meta.get("quoted_table"):
            table_parts = ".".join(p.name for p in expression.parts)
            return self.sql(exp.Identifier(this=table_parts, quoted=True))

        return super().table_parts(expression)

    def timetostr_sql(self, expression: exp.TimeToStr) -> str:
        if isinstance(expression.this, exp.TsOrDsToTimestamp):
            func_name = "FORMAT_DATETIME"
        else:
            func_name = "FORMAT_DATE"
        this = (
            expression.this
            if isinstance(expression.this, (exp.TsOrDsToTimestamp, exp.TsOrDsToDate))
            else expression
        )
        return self.func(func_name, self.format_time(expression), this.this)

    def eq_sql(self, expression: exp.EQ) -> str:
        # Operands of = cannot be NULL in BigQuery
        if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
            if not isinstance(expression.parent, exp.Update):
                return "NULL"

        return self.binary(expression, "=")

    def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
        parent = expression.parent

        # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
        # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
        if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
            return self.func(
                "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
            )

        return super().attimezone_sql(expression)

    def trycast_sql(self, expression: exp.TryCast) -> str:
        return self.cast_sql(expression, safe_prefix="SAFE_")

    def bracket_sql(self, expression: exp.Bracket) -> str:
        this = expression.this
        expressions = expression.expressions

        if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
            arg = expressions[0]
            if arg.type is None:
                from sqlglot.optimizer.annotate_types import annotate_types

                arg = annotate_types(arg)

            if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
                # BQ doesn't support bracket syntax with string values for structs
                return f"{self.sql(this)}.{arg.name}"

        expressions_sql = self.expressions(expression, flat=True)
        offset = expression.args.get("offset")

        if offset == 0:
            expressions_sql = f"OFFSET({expressions_sql})"
        elif offset == 1:
            expressions_sql = f"ORDINAL({expressions_sql})"
        elif offset is not None:
            self.unsupported(f"Unsupported array offset: {offset}")

        if expression.args.get("safe"):
            expressions_sql = f"SAFE_{expressions_sql}"

        return f"{self.sql(this)}[{expressions_sql}]"

    def in_unnest_op(self, expression: exp.Unnest) -> str:
        return self.sql(expression)

    def version_sql(self, expression: exp.Version) -> str:
        if expression.name == "TIMESTAMP":
            expression.set("this", "SYSTEM_TIME")
        return super().version_sql(expression)
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are:
  - False (default): Never quote, except in cases where it's mandatory by the dialect.
  - True or 'always': Always quote.
  - 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are:
  - "upper" or True (default): Convert names to uppercase.
  - "lower": Convert names to lowercase.
  - False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default: ErrorLevel.WARN.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
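A hedged generation sketch: TYPE_MAPPING remaps type names (TEXT -> STRING, DOUBLE -> FLOAT64) and RESERVED_KEYWORDS forces backtick quoting. The exact output may vary by version.

import sqlglot

# Indicative output: CREATE TABLE t (`select` STRING, d FLOAT64)
print(
    sqlglot.transpile(
        'CREATE TABLE t ("select" TEXT, d DOUBLE PRECISION)',
        read="postgres",
        write="bigquery",
    )[0]
)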
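A few more hedged round-trips through the method overrides above (outputs in the comments are indicative):

import sqlglot

# eq_sql: comparing against NULL is folded to NULL outside of UPDATE
print(sqlglot.transpile("SELECT a = NULL", write="bigquery")[0])  # SELECT NULL

# trycast_sql: TRY_CAST is rendered with the SAFE_ prefix
print(sqlglot.transpile("SELECT TRY_CAST(x AS INT)", write="bigquery")[0])  # SELECT SAFE_CAST(x AS INT64)

# bracket_sql: the OFFSET/ORDINAL/SAFE_* accessors round-trip
print(sqlglot.transpile("SELECT arr[SAFE_ORDINAL(2)]", read="bigquery", write="bigquery")[0])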