Pure Python Cython implementation

overfl0 · May 10, 2023 · 563f37e
1 parent ca450ad
commit 563f37e
Show file tree

Hide file tree

Showing 4 changed files with 73 additions and 74 deletions.
diff --git a/armaclass/cython_stubs.py b/armaclass/cython_stubs.py
@@ -1,21 +1,22 @@
 from . import Shadow as cython
 
 PyUnicode_4BYTE_KIND = None
+cython.bytes = bytes
 
-def PyUnicode_KIND(data):
-    return None
+def PyBytes_AsString(b):
+    return b
 
 
-def PyUnicode_FromKindAndData(kind, data, size):
-    return data[:size]
+def PyBytes_AS_STRING(b):
+    return b
 
 
-def PyUnicode_READ(kind, data, pos):
-    return data[pos]
+def PyBytes_GET_SIZE(b):
+    return len(b)
 
 
-def PyUnicode_DATA(data):
-    return data
+def PyUnicode_DecodeUTF8(data, len, errors):
+    return data.decode('utf-8', errors=errors)
 
 
 class vector:

diff --git a/armaclass/parser.pxd b/armaclass/parser.pxd
@@ -1,20 +1,20 @@
-cdef Py_UCS4 QUOTE
-cdef Py_UCS4 SEMICOLON
-cdef Py_UCS4 COLON
-cdef Py_UCS4 EQUALS
-cdef Py_UCS4 CURLY_OPEN
-cdef Py_UCS4 CURLY_CLOSE
-cdef Py_UCS4 SQUARE_OPEN
-cdef Py_UCS4 SQUARE_CLOSE
-cdef Py_UCS4 COMMA
-cdef Py_UCS4 PLUS
-cdef Py_UCS4 MINUS
-cdef Py_UCS4 SLASH
-cdef Py_UCS4 DOLLAR
-cdef Py_UCS4 ASTERISK
-cdef Py_UCS4 NEWLINE
+cdef char QUOTE
+cdef char SEMICOLON
+cdef char COLON
+cdef char EQUALS
+cdef char CURLY_OPEN
+cdef char CURLY_CLOSE
+cdef char SQUARE_OPEN
+cdef char SQUARE_CLOSE
+cdef char COMMA
+cdef char PLUS
+cdef char MINUS
+cdef char SLASH
+cdef char DOLLAR
+cdef char ASTERISK
+cdef char NEWLINE
 
-cdef unicode NEWLINE_U
-cdef unicode END_COMMENT_U
-cdef unicode QUOTE_U
-cdef unicode STR
+cdef bytes NEWLINE_U
+cdef bytes END_COMMENT_U
+cdef bytes QUOTE_U
+cdef bytes STR
diff --git a/armaclass/parser.py b/armaclass/parser.py
@@ -4,18 +4,15 @@
     import cython
 except ModuleNotFoundError:
     from .cython_stubs import (cython,
-                               PyUnicode_FromKindAndData, PyUnicode_4BYTE_KIND, PyUnicode_DATA,
-                               PyUnicode_KIND, PyUnicode_READ,
+                               PyBytes_GET_SIZE, PyBytes_AS_STRING, PyBytes_AsString, PyUnicode_DecodeUTF8,
                                vector)
 
 if cython.compiled:
-    from cython.cimports.cpython import (PyUnicode_FromKindAndData, PyUnicode_4BYTE_KIND, PyUnicode_DATA,
-                                         PyUnicode_KIND, PyUnicode_READ)
+    from cython.cimports.cpython import PyBytes_GET_SIZE, PyBytes_AS_STRING, PyBytes_AsString, PyUnicode_DecodeUTF8
     from cython.cimports.libcpp.vector import vector
 else:
     from .cython_stubs import (cython,
-                               PyUnicode_FromKindAndData, PyUnicode_4BYTE_KIND, PyUnicode_DATA,
-                               PyUnicode_KIND, PyUnicode_READ,
+                               PyBytes_GET_SIZE, PyBytes_AS_STRING, PyBytes_AsString, PyUnicode_DecodeUTF8,
                                vector)
 
 QUOTE = ord('"')
@@ -49,13 +46,10 @@ class ParseError(RuntimeError):
 @cython.cclass
 class Parser:
     currentPosition: cython.Py_ssize_t
-    input_string: cython.unicode
+    input_string: cython.p_char
     input_string_len: cython.Py_ssize_t
     translations: dict
 
-    data: cython.p_void
-    data_kind: cython.int
-
     @cython.cfunc
     def ensure(self, condition: cython.bint, message='Error'):
         if condition:
@@ -66,6 +60,8 @@ def ensure(self, condition: cython.bint, message='Error'):
 
     @cython.cfunc
     @cython.inline
+    @cython.boundscheck(False)
+    @cython.wraparound(False)
     @cython.exceptval(check=False)
     def detectComment(self) -> cython.void:
         indexCommentEnd: cython.Py_ssize_t
@@ -74,24 +70,24 @@ def detectComment(self) -> cython.void:
         if self.currentPosition >= self.input_string_len:
             return
 
-        if PyUnicode_READ(self.data_kind, self.data, self.currentPosition) == SLASH:
+        if self.input_string[self.currentPosition] == SLASH:
             if self.currentPosition + 1 >= self.input_string_len:
                 return
 
-            if PyUnicode_READ(self.data_kind, self.data, self.currentPosition + 1) == SLASH:
+            if self.input_string[self.currentPosition + 1] == SLASH:
                 indexOfLinefeed = self.input_string.find(NEWLINE_U, self.currentPosition)
                 if indexOfLinefeed == -1:
                     self.currentPosition = self.input_string_len
                 else:
                     self.currentPosition = indexOfLinefeed
-            elif PyUnicode_READ(self.data_kind, self.data, self.currentPosition + 1) == ASTERISK:
+            elif self.input_string[self.currentPosition + 1] == ASTERISK:
                 indexCommentEnd = self.input_string.find(END_COMMENT_U, self.currentPosition)
                 self.currentPosition = self.input_string_len if indexCommentEnd == -1 else indexCommentEnd + 2
 
     @cython.cfunc
     @cython.inline
     @cython.exceptval(check=False)
-    def next(self) -> cython.Py_UCS4:
+    def next(self) -> cython.char:
         self.currentPosition += 1
         self.detectComment()
         return self.current()
@@ -105,19 +101,19 @@ def nextWithoutCommentDetection(self) -> cython.void:
     @cython.cfunc
     @cython.inline
     @cython.exceptval(check=False)
-    def current(self) -> cython.Py_UCS4:
+    def current(self) -> cython.char:
         if self.currentPosition >= self.input_string_len:
             return -1
 
-        return PyUnicode_READ(self.data_kind, self.data, self.currentPosition)
+        return self.input_string[self.currentPosition]
 
     @cython.cfunc
     @cython.inline
     @cython.exceptval(check=False)
     def weHaveADoubleQuote(self) -> cython.bint:
         if self.input_string_len >= self.currentPosition + 2 and \
-                PyUnicode_READ(self.data_kind, self.data, self.currentPosition) == QUOTE and \
-                PyUnicode_READ(self.data_kind, self.data, self.currentPosition + 1) == QUOTE:
+                self.input_string[self.currentPosition] == QUOTE and \
+                self.input_string[self.currentPosition + 1] == QUOTE:
             return True
         return False
 
@@ -127,12 +123,12 @@ def weHaveADoubleQuote(self) -> cython.bint:
     def weHaveAStringLineBreak(self) -> cython.bint:
         if (
                 self.input_string_len >= self.currentPosition + 6 and
-                PyUnicode_READ(self.data_kind, self.data, self.currentPosition) == QUOTE and
-                PyUnicode_READ(self.data_kind, self.data, self.currentPosition + 1) == ord(' ') and
-                PyUnicode_READ(self.data_kind, self.data, self.currentPosition + 2) == ord('\\') and
-                PyUnicode_READ(self.data_kind, self.data, self.currentPosition + 3) == ord('n') and
-                PyUnicode_READ(self.data_kind, self.data, self.currentPosition + 4) == ord(' ') and
-                PyUnicode_READ(self.data_kind, self.data, self.currentPosition + 5) == QUOTE
+                self.input_string[self.currentPosition] == QUOTE and
+                self.input_string[self.currentPosition + 1] == ord(' ') and
+                self.input_string[self.currentPosition + 2] == ord('\\') and
+                self.input_string[self.currentPosition + 3] == ord('n') and
+                self.input_string[self.currentPosition + 4] == ord(' ') and
+                self.input_string[self.currentPosition + 5] == QUOTE
         ):
             return True
         return False
@@ -148,7 +144,7 @@ def forwardToNextQuote(self) -> cython.void:
     @cython.cfunc
     @cython.inline
     def parseString(self) -> cython.unicode:
-        result: vector[cython.Py_UCS4]# = vector[cython.Py_UCS4]()
+        result: vector[cython.char]
         if not cython.compiled:
             result = vector()
         result.reserve(100)
@@ -175,12 +171,12 @@ def parseString(self) -> cython.unicode:
 
         self.ensure(self.current() == QUOTE)
         self.nextWithoutCommentDetection()
-        unicode_obj = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, result.data(), result.size())
-        return unicode_obj.decode('utf-8', errors='surrogateescape')
+
+        return PyUnicode_DecodeUTF8(result.data(), result.size(), 'surrogateescape')
 
     @cython.cfunc
     @cython.exceptval(check=False)
-    def guessExpression(self, s: cython.unicode):
+    def guessExpression(self, s: cython.bytes):
         s_len: cython.Py_ssize_t
         s = s.strip()
         slen = len(s)
@@ -195,25 +191,25 @@ def guessExpression(self, s: cython.unicode):
             try:
                 return float(s)
             except ValueError:
-                return s.decode('utf-8', errors='surrogateescape')
+                return PyUnicode_DecodeUTF8(PyBytes_AS_STRING(s), PyBytes_GET_SIZE(s), 'surrogateescape')
         else:
             try:
                 return int(s)
             except ValueError:
-                return s.decode('utf-8', errors='surrogateescape')
+                return PyUnicode_DecodeUTF8(PyBytes_AS_STRING(s), PyBytes_GET_SIZE(s), 'surrogateescape')
 
     @cython.cfunc
     @cython.exceptval(check=False)
     def parseUnknownExpression(self):
         pos: cython.Py_ssize_t
-        c: cython.Py_UCS4
+        c: cython.char
 
         pos = self.currentPosition
         while True:
             if pos >= self.input_string_len:
                 self.ensure(pos < self.input_string_len)  # Just to make it fail
 
-            c = PyUnicode_READ(self.data_kind, self.data, pos)
+            c = self.input_string[pos]
             if c in b';},':
                 break
 
@@ -226,7 +222,7 @@ def parseUnknownExpression(self):
 
     @cython.cfunc
     def parseNonArrayPropertyValue(self):
-        current: cython.Py_UCS4 = self.current()
+        current: cython.char = self.current()
         if current == CURLY_OPEN:
             return self.parseArray()
         elif current == QUOTE:
@@ -239,11 +235,11 @@ def parseNonArrayPropertyValue(self):
     @cython.cfunc
     @cython.inline
     @cython.exceptval(check=False)
-    def isValidVarnameChar(self, c: cython.Py_UCS4) -> cython.bint:
+    def isValidVarnameChar(self, c: cython.char) -> cython.bint:
         return c in b'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_.\\'
 
     @cython.cfunc
-    def parsePropertyName(self) -> cython.unicode:
+    def parsePropertyName(self) -> cython.bytes:
         start: cython.Py_ssize_t = self.currentPosition
         stop: cython.Py_ssize_t = self.currentPosition + 1
 
@@ -303,11 +299,11 @@ def parseWhitespace(self) -> cython.void:
     @cython.inline
     @cython.exceptval(check=False)
     def isWhitespace(self) -> cython.bint:
-        c: cython.Py_UCS4
+        c: cython.char
         if self.input_string_len <= self.currentPosition:
             return False
 
-        c = PyUnicode_READ(self.data_kind, self.data, self.currentPosition)
+        c = self.input_string[self.currentPosition]
         return c in b' \t\r\n' or c < 32
 
     @cython.cfunc
@@ -377,7 +373,7 @@ def parseProperty(self, context: dict) -> cython.void:
         else:
             raise ParseError('Unexpected value at pos {}'.format(self.currentPosition))
 
-        context[name.decode('utf-8', errors='surrogateescape')] = value
+        context[PyUnicode_DecodeUTF8(PyBytes_AS_STRING(name), PyBytes_GET_SIZE(name), 'surrogateescape')] = value
 
         self.parseWhitespace()
         self.ensure(self.current() == SEMICOLON)
@@ -392,39 +388,40 @@ def translateString(self, txt: str) -> str:
 
     @cython.cfunc
     def parseTranslationString(self):
-        result = []
+        result: vector[cython.char]
+        if not cython.compiled:
+            result = vector()
+        result.reserve(100)
+
         assert self.current() == DOLLAR
         self.next()
 
         if self.input_string[self.currentPosition: self.currentPosition + 3] != STR:
             raise ParseError('Invalid translation string beginning')
 
         while self.currentPosition < self.input_string_len:
-            current: cython.Py_UCS4 = self.current()
+            current: cython.char = self.current()
             if current in b';,}':
                 break
             else:
                 if self.isWhitespace():
                     self.parseWhitespace()
                     break
                 else:
-                    result.append(current)
+                    result.push_back(current)
             self.nextWithoutCommentDetection()
 
         if self.currentPosition >= self.input_string_len or self.current() not in b';,}':
             raise ParseError('Syntax error next translation string')
 
-        return self.translateString(bytes(result).decode('utf-8', errors='surrogateescape'))
+        return self.translateString(PyUnicode_DecodeUTF8(result.data(), result.size(), 'surrogateescape'))
 
     def parse(self, raw, translations):
         self.currentPosition = 0
-        self.input_string = raw
+        self.input_string = PyBytes_AsString(raw)  # with error checking
         self.input_string_len = len(raw)
         self.translations = translations or {}
 
-        self.data = PyUnicode_DATA(self.input_string)
-        self.data_kind = PyUnicode_KIND(self.input_string)
-
         result = {}
 
         self.detectComment()

diff --git a/testconfig.py b/testconfig.py
@@ -24,7 +24,8 @@
     print('Reading file...')
     try:
         contents = f.read()
-        contents_s = contents.decode('utf8', errors='surrogateescape')
+        # contents_s = contents.decode('utf8', errors='surrogateescape')
+        contents_s = contents
 
     except UnicodeDecodeError as ex:
         before = contents[:ex.start]