Skip to content

Commit

Permalink
Pure python cython implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
overfl0 committed May 10, 2023
1 parent ca450ad commit 563f37e
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 74 deletions.
17 changes: 9 additions & 8 deletions armaclass/cython_stubs.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
from . import Shadow as cython

PyUnicode_4BYTE_KIND = None
cython.bytes = bytes

def PyUnicode_KIND(data):
return None
def PyBytes_AsString(b):
    """Pure-Python stand-in for the C-API ``PyBytes_AsString``.

    Used when the parser runs uncompiled: the argument is handed back
    untouched, so the caller keeps working with the same object.
    """
    return b


def PyUnicode_FromKindAndData(kind, data, size):
return data[:size]
def PyBytes_AS_STRING(b):
    """Pure-Python stand-in for the C-API ``PyBytes_AS_STRING`` macro.

    Returns the object it receives without copying or converting it.
    """
    return b


def PyUnicode_READ(kind, data, pos):
return data[pos]
def PyBytes_GET_SIZE(b):
    """Pure-Python stand-in for the C-API ``PyBytes_GET_SIZE`` macro.

    Returns ``len(b)``.
    """
    size = len(b)
    return size


def PyUnicode_DATA(data):
return data
def PyUnicode_DecodeUTF8(data, len, errors):
    """Pure-Python stand-in for the C-API ``PyUnicode_DecodeUTF8``.

    Args:
        data: Object holding the encoded bytes; it must support slicing and
            the slice must provide ``decode`` (``bytes`` and ``bytearray`` do).
        len: Number of leading bytes of ``data`` to decode. The name shadows
            the builtin only because it is part of the existing signature.
        errors: Name of the codec error handler, e.g. ``'strict'`` or
            ``'surrogateescape'``.

    Returns:
        The decoded ``str``.
    """
    # Honour the explicit length, as the C function does, instead of always
    # decoding the whole buffer.
    return data[:len].decode('utf-8', errors=errors)


class vector:
Expand Down
38 changes: 19 additions & 19 deletions armaclass/parser.pxd
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
cdef Py_UCS4 QUOTE
cdef Py_UCS4 SEMICOLON
cdef Py_UCS4 COLON
cdef Py_UCS4 EQUALS
cdef Py_UCS4 CURLY_OPEN
cdef Py_UCS4 CURLY_CLOSE
cdef Py_UCS4 SQUARE_OPEN
cdef Py_UCS4 SQUARE_CLOSE
cdef Py_UCS4 COMMA
cdef Py_UCS4 PLUS
cdef Py_UCS4 MINUS
cdef Py_UCS4 SLASH
cdef Py_UCS4 DOLLAR
cdef Py_UCS4 ASTERISK
cdef Py_UCS4 NEWLINE
cdef char QUOTE
cdef char SEMICOLON
cdef char COLON
cdef char EQUALS
cdef char CURLY_OPEN
cdef char CURLY_CLOSE
cdef char SQUARE_OPEN
cdef char SQUARE_CLOSE
cdef char COMMA
cdef char PLUS
cdef char MINUS
cdef char SLASH
cdef char DOLLAR
cdef char ASTERISK
cdef char NEWLINE

cdef unicode NEWLINE_U
cdef unicode END_COMMENT_U
cdef unicode QUOTE_U
cdef unicode STR
cdef bytes NEWLINE_U
cdef bytes END_COMMENT_U
cdef bytes QUOTE_U
cdef bytes STR
89 changes: 43 additions & 46 deletions armaclass/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,15 @@
import cython
except ModuleNotFoundError:
from .cython_stubs import (cython,
PyUnicode_FromKindAndData, PyUnicode_4BYTE_KIND, PyUnicode_DATA,
PyUnicode_KIND, PyUnicode_READ,
PyBytes_GET_SIZE, PyBytes_AS_STRING, PyBytes_AsString, PyUnicode_DecodeUTF8,
vector)

if cython.compiled:
from cython.cimports.cpython import (PyUnicode_FromKindAndData, PyUnicode_4BYTE_KIND, PyUnicode_DATA,
PyUnicode_KIND, PyUnicode_READ)
from cython.cimports.cpython import PyBytes_GET_SIZE, PyBytes_AS_STRING, PyBytes_AsString, PyUnicode_DecodeUTF8
from cython.cimports.libcpp.vector import vector
else:
from .cython_stubs import (cython,
PyUnicode_FromKindAndData, PyUnicode_4BYTE_KIND, PyUnicode_DATA,
PyUnicode_KIND, PyUnicode_READ,
PyBytes_GET_SIZE, PyBytes_AS_STRING, PyBytes_AsString, PyUnicode_DecodeUTF8,
vector)

QUOTE = ord('"')
Expand Down Expand Up @@ -49,13 +46,10 @@ class ParseError(RuntimeError):
@cython.cclass
class Parser:
currentPosition: cython.Py_ssize_t
input_string: cython.unicode
input_string: cython.p_char
input_string_len: cython.Py_ssize_t
translations: dict

data: cython.p_void
data_kind: cython.int

@cython.cfunc
def ensure(self, condition: cython.bint, message='Error'):
if condition:
Expand All @@ -66,6 +60,8 @@ def ensure(self, condition: cython.bint, message='Error'):

@cython.cfunc
@cython.inline
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.exceptval(check=False)
def detectComment(self) -> cython.void:
indexCommentEnd: cython.Py_ssize_t
Expand All @@ -74,24 +70,24 @@ def detectComment(self) -> cython.void:
if self.currentPosition >= self.input_string_len:
return

if PyUnicode_READ(self.data_kind, self.data, self.currentPosition) == SLASH:
if self.input_string[self.currentPosition] == SLASH:
if self.currentPosition + 1 >= self.input_string_len:
return

if PyUnicode_READ(self.data_kind, self.data, self.currentPosition + 1) == SLASH:
if self.input_string[self.currentPosition + 1] == SLASH:
indexOfLinefeed = self.input_string.find(NEWLINE_U, self.currentPosition)
if indexOfLinefeed == -1:
self.currentPosition = self.input_string_len
else:
self.currentPosition = indexOfLinefeed
elif PyUnicode_READ(self.data_kind, self.data, self.currentPosition + 1) == ASTERISK:
elif self.input_string[self.currentPosition + 1] == ASTERISK:
indexCommentEnd = self.input_string.find(END_COMMENT_U, self.currentPosition)
self.currentPosition = self.input_string_len if indexCommentEnd == -1 else indexCommentEnd + 2

@cython.cfunc
@cython.inline
@cython.exceptval(check=False)
def next(self) -> cython.char:
    """Step the cursor forward by one and return the byte it then points at.

    After the increment ``detectComment()`` runs, which may move the cursor
    again; the result is whatever ``current()`` reports afterwards.
    """
    self.currentPosition = self.currentPosition + 1
    self.detectComment()
    return self.current()
Expand All @@ -105,19 +101,19 @@ def nextWithoutCommentDetection(self) -> cython.void:
@cython.cfunc
@cython.inline
@cython.exceptval(check=False)
def current(self) -> cython.char:
    """Return the byte under the cursor, or ``-1`` when the cursor is at or
    past the end of the input.
    """
    # NOTE(review): with a signed ``char`` return type, the ``-1`` sentinel
    # is presumably indistinguishable from an input byte 0xFF - confirm
    # whether callers can ever see that byte.
    if self.currentPosition < self.input_string_len:
        return self.input_string[self.currentPosition]

    return -1

@cython.cfunc
@cython.inline
@cython.exceptval(check=False)
def weHaveADoubleQuote(self) -> cython.bint:
    """Report whether the cursor sits on two consecutive ``QUOTE`` bytes.

    Returns ``False`` when fewer than two bytes remain in the input.
    """
    # Length guard first so neither index below can run past the input.
    if self.currentPosition + 2 > self.input_string_len:
        return False

    return (
        self.input_string[self.currentPosition] == QUOTE
        and self.input_string[self.currentPosition + 1] == QUOTE
    )

Expand All @@ -127,12 +123,12 @@ def weHaveADoubleQuote(self) -> cython.bint:
def weHaveAStringLineBreak(self) -> cython.bint:
    """Report whether the six bytes at the cursor form a string line break.

    The sequence checked is: ``QUOTE``, a space, a backslash, the letter
    ``n``, a space and ``QUOTE`` again. Returns ``False`` when fewer than
    six bytes remain in the input.
    """
    # Length guard first so none of the six indexed reads can overrun.
    if self.currentPosition + 6 > self.input_string_len:
        return False

    return (
        self.input_string[self.currentPosition] == QUOTE
        and self.input_string[self.currentPosition + 1] == ord(' ')
        and self.input_string[self.currentPosition + 2] == ord('\\')
        and self.input_string[self.currentPosition + 3] == ord('n')
        and self.input_string[self.currentPosition + 4] == ord(' ')
        and self.input_string[self.currentPosition + 5] == QUOTE
    )
Expand All @@ -148,7 +144,7 @@ def forwardToNextQuote(self) -> cython.void:
@cython.cfunc
@cython.inline
def parseString(self) -> cython.unicode:
result: vector[cython.Py_UCS4]# = vector[cython.Py_UCS4]()
result: vector[cython.char]
if not cython.compiled:
result = vector()
result.reserve(100)
Expand All @@ -175,12 +171,12 @@ def parseString(self) -> cython.unicode:

self.ensure(self.current() == QUOTE)
self.nextWithoutCommentDetection()
unicode_obj = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, result.data(), result.size())
return unicode_obj.decode('utf-8', errors='surrogateescape')

return PyUnicode_DecodeUTF8(result.data(), result.size(), 'surrogateescape')

@cython.cfunc
@cython.exceptval(check=False)
def guessExpression(self, s: cython.unicode):
def guessExpression(self, s: cython.bytes):
s_len: cython.Py_ssize_t
s = s.strip()
slen = len(s)
Expand All @@ -195,25 +191,25 @@ def guessExpression(self, s: cython.unicode):
try:
return float(s)
except ValueError:
return s.decode('utf-8', errors='surrogateescape')
return PyUnicode_DecodeUTF8(PyBytes_AS_STRING(s), PyBytes_GET_SIZE(s), 'surrogateescape')
else:
try:
return int(s)
except ValueError:
return s.decode('utf-8', errors='surrogateescape')
return PyUnicode_DecodeUTF8(PyBytes_AS_STRING(s), PyBytes_GET_SIZE(s), 'surrogateescape')

@cython.cfunc
@cython.exceptval(check=False)
def parseUnknownExpression(self):
pos: cython.Py_ssize_t
c: cython.Py_UCS4
c: cython.char

pos = self.currentPosition
while True:
if pos >= self.input_string_len:
self.ensure(pos < self.input_string_len) # Just to make it fail

c = PyUnicode_READ(self.data_kind, self.data, pos)
c = self.input_string[pos]
if c in b';},':
break

Expand All @@ -226,7 +222,7 @@ def parseUnknownExpression(self):

@cython.cfunc
def parseNonArrayPropertyValue(self):
current: cython.Py_UCS4 = self.current()
current: cython.char = self.current()
if current == CURLY_OPEN:
return self.parseArray()
elif current == QUOTE:
Expand All @@ -239,11 +235,11 @@ def parseNonArrayPropertyValue(self):
@cython.cfunc
@cython.inline
@cython.exceptval(check=False)
def isValidVarnameChar(self, c: cython.char) -> cython.bint:
    """Return True when ``c`` is an ASCII letter, an ASCII digit, ``_``,
    ``.`` or a backslash.
    """
    # Adjacent literals are joined at compile time into the same single
    # bytes constant the membership test has always used.
    return c in (
        b'abcdefghijklmnopqrstuvwxyz'
        b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        b'0123456789'
        b'_.\\'
    )

@cython.cfunc
def parsePropertyName(self) -> cython.unicode:
def parsePropertyName(self) -> cython.bytes:
start: cython.Py_ssize_t = self.currentPosition
stop: cython.Py_ssize_t = self.currentPosition + 1

Expand Down Expand Up @@ -303,11 +299,11 @@ def parseWhitespace(self) -> cython.void:
@cython.inline
@cython.exceptval(check=False)
def isWhitespace(self) -> cython.bint:
    """Report whether the byte under the cursor is a space or an ASCII
    control character (byte values 0 through 32).

    Returns ``False`` when the cursor is at or past the end of the input.
    """
    c: cython.char
    if self.input_string_len <= self.currentPosition:
        return False

    c = self.input_string[self.currentPosition]
    # When compiled, ``c`` is a plain C ``char``, whose signedness is
    # implementation-defined. Where it is signed, bytes >= 0x80 (all
    # non-ASCII UTF-8 bytes) read back as negative numbers, so a bare
    # ``c < 32`` would classify them as whitespace. Bounding the range from
    # below keeps exactly 0..32: space, tab, CR, LF and the other control
    # characters. Uncompiled, bytes index to 0..255 and the result is
    # unchanged.
    return 0 <= c <= 32

@cython.cfunc
Expand Down Expand Up @@ -377,7 +373,7 @@ def parseProperty(self, context: dict) -> cython.void:
else:
raise ParseError('Unexpected value at pos {}'.format(self.currentPosition))

context[name.decode('utf-8', errors='surrogateescape')] = value
context[PyUnicode_DecodeUTF8(PyBytes_AS_STRING(name), PyBytes_GET_SIZE(name), 'surrogateescape')] = value

self.parseWhitespace()
self.ensure(self.current() == SEMICOLON)
Expand All @@ -392,39 +388,40 @@ def translateString(self, txt: str) -> str:

@cython.cfunc
def parseTranslationString(self):
    """Parse a translation key starting at the cursor and return its
    translation.

    The cursor must be on ``DOLLAR``. After stepping past it, the next three
    bytes must equal the module-level ``STR`` constant. Bytes are then
    collected until ``;``, ``,``, ``}`` or whitespace is reached, decoded as
    UTF-8 with ``surrogateescape`` and passed to ``translateString``.

    Raises:
        ParseError: if the three bytes after the dollar sign are missing or
            differ from ``STR``, or if the key is not followed by ``;``,
            ``,`` or ``}``.
    """
    result: vector[cython.char]
    if not cython.compiled:
        result = vector()
    result.reserve(100)

    assert self.current() == DOLLAR
    self.next()

    # Check the remaining length before slicing. When compiled,
    # ``input_string`` is a C ``char`` pointer and slicing it is not
    # bounds-checked, so a ``$`` within the last bytes of the input would
    # otherwise read past the buffer. Uncompiled, a too-short slice simply
    # compares unequal, so the error raised is the same either way.
    if (self.currentPosition + 3 > self.input_string_len or
            self.input_string[self.currentPosition: self.currentPosition + 3] != STR):
        raise ParseError('Invalid translation string beginning')

    while self.currentPosition < self.input_string_len:
        current: cython.char = self.current()
        if current in b';,}':
            break

        if self.isWhitespace():
            # Trailing whitespace ends the key; skip it so the terminator
            # check below looks at the next significant byte.
            self.parseWhitespace()
            break

        result.push_back(current)
        self.nextWithoutCommentDetection()

    if self.currentPosition >= self.input_string_len or self.current() not in b';,}':
        raise ParseError('Syntax error next translation string')

    return self.translateString(PyUnicode_DecodeUTF8(result.data(), result.size(), 'surrogateescape'))

def parse(self, raw, translations):
self.currentPosition = 0
self.input_string = raw
self.input_string = PyBytes_AsString(raw) # with error checking
self.input_string_len = len(raw)
self.translations = translations or {}

self.data = PyUnicode_DATA(self.input_string)
self.data_kind = PyUnicode_KIND(self.input_string)

result = {}

self.detectComment()
Expand Down
3 changes: 2 additions & 1 deletion testconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
print('Reading file...')
try:
contents = f.read()
contents_s = contents.decode('utf8', errors='surrogateescape')
# contents_s = contents.decode('utf8', errors='surrogateescape')
contents_s = contents

except UnicodeDecodeError as ex:
before = contents[:ex.start]
Expand Down

0 comments on commit 563f37e

Please sign in to comment.