Skip to content

Commit

Permalink
Merge branch 'microsoft:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
DetachHead authored Mar 22, 2024
2 parents d5a9c85 + 3da27fd commit 00f1bb2
Show file tree
Hide file tree
Showing 7 changed files with 3,691 additions and 2,861 deletions.
205 changes: 205 additions & 0 deletions build/generateUnicodeTables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
# generateUnicodeTables.py
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Generates the content of unicode.ts based on the official Unicode
# character database.

import sys
import urllib.request
from io import TextIOWrapper


class Character:
def __init__(self, code: int, category: str, *, end: int | None = None):
self.code = code
self.category = category

self.hasSurrogate = code > 0xFFFF
if self.hasSurrogate:
unicodeChar = chr(code)
utf16 = unicodeChar.encode("utf-16")
rawHex = utf16.hex()
hex = rawHex[4:]

self.highSurrogate = int(hex[2:4] + hex[0:2], base=16)
self.lowSurrogate = int(hex[6:8] + hex[4:6], base=16)


# A run of characters identified by its first and last members (both
# endpoints belong to the run).
class CharacterRange:
    def __init__(self, start: Character, end: Character):
        self.start, self.end = start, end


# Fetches UnicodeData.txt for the given Unicode version (for example "15.1")
# from unicode.org and returns the path of the local copy that urlretrieve
# created.
def downloadUnicodeData(unicodeVersion: str) -> str:
    url = f"https://www.unicode.org/Public/{unicodeVersion}.0/ucd/UnicodeData.txt"
    # urlretrieve returns (local filename, headers); only the filename is needed.
    return urllib.request.urlretrieve(url)[0]


# Parses a UnicodeData.txt-style file (semicolon-separated fields: code point
# in hex, name, general category, ...) and returns one Character per code
# point, in file order. A "<..., First>" / "<..., Last>" pair of lines is
# expanded into every code point between the two (inclusive), all sharing
# the category of the "First" line.
def parseFile(filePath: str) -> list[Character]:
    # Use an explicit encoding so parsing does not depend on the platform's
    # default locale encoding.
    with open(filePath, "r", encoding="utf-8") as reader:
        # Drop blank lines (such as a trailing empty line) so they neither
        # break the hex parsing nor get mistaken for the "Last" line of a range.
        lines = [line for line in reader if line.strip()]

    chars: list[Character] = []
    for i, line in enumerate(lines):
        fields = line.split(";")
        charCode = int(fields[0], base=16)
        name = fields[1]
        category = fields[2]

        if name.endswith(", First>"):
            # Legacy range syntax
            # D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
            # DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
            lastCharCode = int(lines[i + 1].split(";")[0], base=16)
            chars.extend(
                Character(code, category)
                for code in range(charCode, lastCharCode + 1)
            )
        elif name.endswith(", Last>"):
            # Already covered when the matching "First" line was expanded.
            continue
        else:
            chars.append(Character(charCode, category))

    return chars


# Given a collection of characters (expected in ascending code order),
# returns the ranges of contiguous characters whose code is greater than
# 0xFFFF. Characters at or below 0xFFFF never produce a range here.
# Contiguous means that the high surrogate is the same, the low surrogate
# values are sequential with no gaps, and the characters all have the same
# category. So, two character codes might be sequential numerically but have
# different high surrogates, and therefore would not be members of the same
# range.
def getSurrogateRanges(chars: list[Character]) -> list[CharacterRange]:
    surrogateRanges: list[CharacterRange] = []

    consecutiveRangeStartChar: Character | None = None
    previousChar: Character | None = None
    for char in chars:
        if consecutiveRangeStartChar is None:
            consecutiveRangeStartChar = char

        if previousChar is not None:
            if not previousChar.hasSurrogate:
                # No range is open while walking non-surrogate characters;
                # the first surrogate character encountered opens one.
                if char.hasSurrogate:
                    consecutiveRangeStartChar = char
            else:
                # Checking hasSurrogate first means a non-surrogate character
                # simply closes the open range instead of failing on the
                # missing surrogate attributes.
                extendsRange = (
                    char.hasSurrogate
                    and char.highSurrogate == previousChar.highSurrogate
                    and char.lowSurrogate == previousChar.lowSurrogate + 1
                    and char.category == previousChar.category
                )
                if not extendsRange:
                    surrogateRanges.append(
                        CharacterRange(consecutiveRangeStartChar, previousChar)
                    )
                    consecutiveRangeStartChar = char

        previousChar = char

    # Flush the range that is still open when the input ends; without this
    # the final surrogate range would be silently dropped.
    if previousChar is not None and previousChar.hasSurrogate:
        surrogateRanges.append(
            CharacterRange(consecutiveRangeStartChar, previousChar)
        )

    return surrogateRanges


# Emits the "unicode<category>" table: every character code in the given
# category, as full Unicode code points (not surrogate values). A run of
# sequential codes is collapsed into a two-element [start, end] array to
# save space; an isolated code is written as a single number.
def writeRangeTable(writer: TextIOWrapper, category: str, chars: list[Character]):
    members = [ch for ch in chars if ch.category == category]
    memberCount = len(members)

    writer.write(f"export const unicode{category}: UnicodeRangeTable = [\n")

    runStart: Character | None = None
    for index, current in enumerate(members):
        if runStart is None:
            runStart = current

        # Keep extending the run while the next member is the next code.
        hasNext = index + 1 < memberCount
        if hasNext and members[index + 1].code == current.code + 1:
            continue

        if runStart.code == current.code:
            writer.write(f" 0x{runStart.code:04X},\n")
        else:
            writer.write(f" [0x{runStart.code:04X}, 0x{current.code:04X}],\n")

        runStart = None

    writer.write("];\n\n")


# Emits the "unicode<category>Surrogate" table: the characters of the given
# category expressed as UTF-16 values, grouped under their high surrogate.
# Within a group, a run of sequential low surrogates is written as a
# two-element [start, end] array and a lone low surrogate as a single number.
# Nothing is written when the category has no surrogate ranges.
def writeSurrogateRangeTable(
    writer: TextIOWrapper, category: str, surrogateRanges: list[CharacterRange]
):
    matching = [r for r in surrogateRanges if r.start.category == category]
    if not matching:
        return

    writer.write(
        f"export const unicode{category}Surrogate: UnicodeSurrogateRangeTable = {{\n"
    )

    # High surrogate of the group currently being written, if any.
    openHighSurrogate: int | None = None
    for entry in matching:
        first, last = entry.start, entry.end

        if first.highSurrogate != openHighSurrogate:
            # Close the previous group (if there was one) and open a new one.
            if openHighSurrogate is not None:
                writer.write(" ],\n")
            writer.write(f" 0x{first.highSurrogate:04X}: [\n")
            openHighSurrogate = first.highSurrogate

        if first.lowSurrogate == last.lowSurrogate:
            writer.write(f" 0x{first.lowSurrogate:04X}, // 0x{first.code:04X}\n")
        else:
            writer.write(
                f" [0x{first.lowSurrogate:04X}, 0x{last.lowSurrogate:04X}], // 0x{first.code:04X}..0x{last.code:04X}\n"
            )

    # Close the final group and the table itself.
    writer.write(" ],\n")
    writer.write("};\n\n")


# Script body: the Unicode version may be given as the first command-line
# argument and otherwise defaults to 15.1. The character database is
# downloaded, parsed, and unicode.ts is regenerated from it.
unicodeVersion = sys.argv[1] if len(sys.argv) > 1 else "15.1"
path = downloadUnicodeData(unicodeVersion)
chars = parseFile(path)
surrogateRanges = getSurrogateRanges(chars)

with open("packages/pyright-internal/src/parser/unicode.ts", "w") as writer:
    # File header plus the TypeScript type declarations used by the tables.
    writer.write(
        f"""/*
* unicode.ts
* Copyright (c) Microsoft Corporation.
* Licensed under the MIT license.
*
* Tables that encode Unicode character codes for various Unicode-
* defined categories used in the Python spec.
*
* Generated by build/generateUnicodeTables.py from the UnicodeData.txt
* metadata file for Unicode {unicodeVersion}.
*/
export type UnicodeRange = [number, number] | number;
export type UnicodeRangeTable = UnicodeRange[];
export type UnicodeSurrogateRangeTable = {{ [surrogate: number]: UnicodeRange[] }};
"""
    )

    # For each category, write the code-point table followed by the
    # surrogate table (the latter only when the category has such ranges).
    for category in ["Lu", "Ll", "Lt", "Lo", "Lm", "Nl", "Mn", "Mc", "Nd", "Pc"]:
        writeRangeTable(writer, category, chars)
        writeSurrogateRangeTable(writer, category, surrogateRanges)
12 changes: 4 additions & 8 deletions packages/pyright-internal/src/analyzer/typeEvaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13958,7 +13958,7 @@ export function createTypeEvaluator(importLookup: ImportLookup, evaluatorOptions
isNarrowable: boolean
): Type | undefined {
// If the expected type is Any, the resulting type becomes Any.
if (isAnyOrUnknown(inferenceContext.expectedType)) {
if (isAny(inferenceContext.expectedType)) {
return inferenceContext.expectedType;
}

Expand Down Expand Up @@ -17961,20 +17961,16 @@ export function createTypeEvaluator(importLookup: ImportLookup, evaluatorOptions
functionType: FunctionType,
typeParametersSeen: TypeVarType[]
) {
const typeVarsInReturnType = getTypeVarArgumentsRecursive(returnType);
const typeVarsInReturnType = getTypeVarArgumentsRecursive(returnType).filter(
(t) => t.scopeId === functionType.details.typeVarScopeId
);
const rescopedTypeVars: TypeVarType[] = [];

typeVarsInReturnType.forEach((typeVar) => {
if (TypeBase.isInstantiable(typeVar)) {
typeVar = TypeVarType.cloneAsInstance(typeVar);
}

// If this type variable isn't scoped to this function, it is probably
// associated with an outer scope.
if (typeVar.scopeId !== functionType.details.typeVarScopeId) {
return;
}

// If this type variable was already seen in one or more input parameters,
// don't attempt to rescope it.
if (typeParametersSeen.some((tp) => isTypeSame(convertToInstance(tp), typeVar))) {
Expand Down
3 changes: 3 additions & 0 deletions packages/pyright-internal/src/parser/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,10 @@ export class Parser {
this._getNextToken();
}

const wasParsingTypeAnnotation = this._isParsingTypeAnnotation;
this._isParsingTypeAnnotation = true;
const expression = this._parseTestExpression(/* allowAssignmentExpression */ false);
this._isParsingTypeAnnotation = wasParsingTypeAnnotation;

return TypeAliasNode.create(typeToken, name, expression, typeParameters);
}
Expand Down
Loading

0 comments on commit 00f1bb2

Please sign in to comment.