forked from urvanov-ru/crayon-syntax-highlighter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclass-urvanov-syntax-highlighter-parser.php
397 lines (337 loc) · 11.7 KB
/
class-urvanov-syntax-highlighter-parser.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
<?php
/**
* Parser Class
*
* @package Crayon Syntax Highlighter
* @author Fedor Urvanov, Aram Kocharyan
* @copyright Copyright 2013, Aram Kocharyan
* @link https://urvanov.ru
*/
defined( 'ABSPATH' ) || exit;
require_once 'class-urvanov-syntax-highlighter-global.php';
require_once URVANOV_SYNTAX_HIGHLIGHTER_LANGS_PHP;
/**
 * Parses language definition files for the syntax highlighter.
 *
 * Reads a language file, extracts its name, version, modes and element
 * definitions, builds and validates the regex of every element and stores the
 * results on the matching language object.
 *
 * Class Urvanov_Syntax_Highlighter_Parser
 */
class Urvanov_Syntax_Highlighter_Parser {

	/**
	 * Mode name: case insensitive.
	 */
	const CASE_INSENSITIVE = 'CASE_INSENSITIVE';

	/**
	 * Mode name: multi line.
	 */
	const MULTI_LINE = 'MULTI_LINE';

	/**
	 * Mode name: single line.
	 */
	const SINGLE_LINE = 'SINGLE_LINE';

	/**
	 * Mode name: allow mixed.
	 */
	const ALLOW_MIXED = 'ALLOW_MIXED';

	/**
	 * Name of the element that is appended to every language to catch HTML characters.
	 */
	const HTML_CHAR = 'HTML_CHAR';

	/**
	 * Regex of the HTML_CHAR element: matches "<", ">", an entity such as "&amp;"
	 * (trailing semicolon optional) or a run of spaces/tabs.
	 */
	const HTML_CHAR_REGEX = '<|>|(&([\w-]+);?)|[ \t]+';

	/**
	 * Name of the reserved internal highlighter element.
	 */
	const URVANOV_SYNTAX_HIGHLIGHTER_ELEMENT = 'URVANOV_SYNTAX_HIGHLIGHTER_ELEMENT';

	/**
	 * Regex matching an internal "{{urvanov-syntax-highlighter-internal:...}}" placeholder.
	 */
	const URVANOV_SYNTAX_HIGHLIGHTER_ELEMENT_REGEX = '\{\{urvanov-syntax-highlighter-internal:[^\}]*\}\}';

	/**
	 * Same as URVANOV_SYNTAX_HIGHLIGHTER_ELEMENT_REGEX, but capturing the text after the colon.
	 */
	const URVANOV_SYNTAX_HIGHLIGHTER_ELEMENT_REGEX_CAPTURE = '\{\{urvanov-syntax-highlighter-internal:([^\}]*)\}\}';

	/**
	 * Known mode names, used as array keys.
	 *
	 * @var bool[]
	 */
	private static $modes = array(
		self::CASE_INSENSITIVE => true,
		self::MULTI_LINE       => true,
		self::SINGLE_LINE      => true,
		self::ALLOW_MIXED      => true,
	);

	/**
	 * Urvanov_Syntax_Highlighter_Parser constructor.
	 *
	 * Private: the class only exposes static methods.
	 */
	private function __construct() {}

	/**
	 * Parse all languages stored in Urvanov_Syntax_Highlighter_Langs.
	 * Avoid using this unless you must list the details in language files for all languages.
	 *
	 * @return array|false The loaded languages, or false when none are loaded.
	 */
	public static function parse_all() {
		$langs = Urvanov_Syntax_Highlighter_Resources::langs()->get();
		if ( empty( $langs ) ) {
			return false;
		}

		foreach ( $langs as $lang ) {
			self::parse( $lang->id() );
		}

		return $langs;
	}

	/**
	 * Read a syntax file and parse the regex rules within it, this may require several other
	 * files containing lists of keywords and such to be read. Updates the parsed elements and
	 * regex in the Urvanov_Syntax_Highlighter_Lang with the given $id.
	 *
	 * @param mixed $id ID of the language to parse.
	 *
	 * @return false|void False when the language is not loaded or its file cannot be read,
	 *                    nothing otherwise (including when it was already parsed).
	 */
	public static function parse( $id ) {
		// Verify the language is loaded and has not been parsed before.
		$lang = Urvanov_Syntax_Highlighter_Resources::langs()->get( $id );
		if ( ! $lang ) {
			UrvanovSyntaxHighlighterLog::syslog( "The language with id '$id' was not loaded and could not be parsed." );
			return false;
		}
		if ( $lang->is_parsed() ) {
			return;
		}

		// Read language file.
		$path = Urvanov_Syntax_Highlighter_Resources::langs()->path( $id );
		UrvanovSyntaxHighlighterLog::debug( 'Parsing language ' . $path );
		$file = UrvanovSyntaxHighlighterUtil::lines( $path, 'wcs' );
		if ( false === $file ) {
			UrvanovSyntaxHighlighterLog::debug( 'Parsing failed ' . $path );
			return false;
		}

		// Extract the language name and drop the line so it is not read as an element.
		$name_pattern = '#^[ \t]*name[ \t]+([^\r\n]+)[ \t]*#mi';
		preg_match( $name_pattern, $file, $name_match );
		if ( count( $name_match ) > 1 ) {
			$lang->name( $name_match[1] );
			$file = preg_replace( $name_pattern, '', $file );
		}

		// Extract the language version, same approach as the name.
		$version_pattern = '#^[ \t]*version[ \t]+([^\r\n]+)[ \t]*#mi';
		preg_match( $version_pattern, $file, $version_match );
		if ( count( $version_match ) > 1 ) {
			$lang->version( $version_match[1] );
			$file = preg_replace( $version_pattern, '', $file );
		}

		// Extract the modes.
		$mode_pattern = '#^[ \t]*(' . implode( '|', array_keys( self::$modes ) ) . ')[ \t]+(?:=[ \t]*)?([^\r\n]+)[ \t]*#mi';
		preg_match_all( $mode_pattern, $file, $mode_matches );
		if ( count( $mode_matches ) === 3 ) {
			foreach ( $mode_matches[1] as $i => $mode_name ) {
				// The pattern is case-insensitive while the known mode keys are upper-case,
				// so normalise the matched name before handing it to the language.
				$lang->mode( strtoupper( $mode_name ), $mode_matches[2][ $i ] );
			}
			$file = preg_replace( $mode_pattern, '', $file );
		}

		/* Add reserved Crayon element. This is used by Crayon internally. */
		$urvanov_syntax_highlighter_element = new Urvanov_Syntax_Highlighter_Element( self::URVANOV_SYNTAX_HIGHLIGHTER_ELEMENT, $path, self::URVANOV_SYNTAX_HIGHLIGHTER_ELEMENT_REGEX );
		$lang->element( self::URVANOV_SYNTAX_HIGHLIGHTER_ELEMENT, $urvanov_syntax_highlighter_element );

		// Extract elements, classes and regex: "NAME[:FALLBACK] [css classes] regex".
		$pattern = '#^[ \t]*([\w:]+)[ \t]+(?:\[([\w\t ]*)\][ \t]+)?([^\r\n]+)[ \t]*#m';
		preg_match_all( $pattern, $file, $matches );
		// Guard with empty() so a failed match cannot hand a missing index to count().
		$count = empty( $matches[0] ) ? 0 : count( $matches[0] );
		if ( 0 === $count ) {
			UrvanovSyntaxHighlighterLog::syslog( "No regex patterns and/or elements were parsed from language file at '$path'." );
		}

		// Remember state in case we encounter recoverable errors.
		$error = false;
		for ( $i = 0; $i < $count; $i++ ) {
			$name  = trim( strtoupper( $matches[1][ $i ] ) );
			$class = $matches[2][ $i ];
			$regex = $matches[3][ $i ];

			// Ensure both the element and regex are valid.
			if ( empty( $name ) || empty( $regex ) ) {
				UrvanovSyntaxHighlighterLog::syslog( "Element(s) and/or regex(es) are missing in '$path'." );
				$error = true;
				continue;
			}

			// Look for fallback element, written as "NAME:FALLBACK".
			$pieces = explode( ':', $name );
			if ( count( $pieces ) > 2 ) {
				UrvanovSyntaxHighlighterLog::syslog( "Too many colons found in element name '$name' in '$path'" );
				$error = true;
				continue;
			}
			$name     = $pieces[0];
			$fallback = $pieces[1] ?? '';

			// Create a new Urvanov_Syntax_Highlighter_Element.
			$element = new Urvanov_Syntax_Highlighter_Element( $name, $path, '' );
			$element->fallback( $fallback );
			if ( ! empty( $class ) ) {
				// Avoid setting known css to blank.
				$element->css( $class );
			}
			if ( $element->regex( $regex ) === false ) {
				$error = true;
				continue;
			}

			// Add the element, with its regex, to the language.
			$lang->element( $name, $element );
		}

		// Record the outcome once every definition has been processed, so that an error in the
		// last definition(s) is not lost. Nothing is recorded when the file had no definitions.
		if ( $count > 0 ) {
			$lang->state( $error ? Urvanov_Syntax_Highlighter_Lang::PARSED_ERRORS : Urvanov_Syntax_Highlighter_Lang::PARSED_SUCCESS );
		}

		/**
		 * Prevents < > and other html entities from being printed as is, which could lead to actual html tags
		 * from the printed code appearing on the page - not good. This can also act to color any HTML entities
		 * that are not picked up by previously defined elements.
		 */
		$html = new Urvanov_Syntax_Highlighter_Element( self::HTML_CHAR, $path, self::HTML_CHAR_REGEX );
		$lang->element( self::HTML_CHAR, $html );
	}

	/**
	 * Expands the special tags of an element regex and validates the result.
	 *
	 * Handles "(?alt:file)", "(?default)" / "(?default:element)" and "(?html:text)", then escapes
	 * the regex for use with the "#" delimiter and test-compiles it.
	 *
	 * @param string $regex   RegEx.
	 * @param object $element Element the regex belongs to; must be an instance of the class named by
	 *                        URVANOV_SYNTAX_HIGHLIGHTER_ELEMENT_CLASS.
	 *
	 * @return string|false The processed regex, an empty string when $element is not such an
	 *                      element, or false when a tag could not be resolved or the regex is invalid.
	 */
	public static function validate_regex( string $regex, $element = null ) {
		// get_class() only accepts objects (TypeError on PHP 8 otherwise), and $element defaults
		// to null, so check the type before asking for the class.
		if ( ! is_object( $element ) || URVANOV_SYNTAX_HIGHLIGHTER_ELEMENT_CLASS !== get_class( $element ) ) {
			return '';
		}

		// If the (?alt) tag has been used, insert the file into the regex.
		$file = self::regex_match( '#\(\?alt:(.+?)\)#', $regex );
		if ( 2 === count( $file ) ) {
			// Element 0 has full match, 1 has captured groups.
			$count = count( $file[1] );
			for ( $i = 0; $i < $count; $i++ ) {
				$file_lines = UrvanovSyntaxHighlighterUtil::lines( dirname( $element->path() ) . Urvanov_Syntax_Highlighter_Global::fix_s() . $file[1][ $i ], 'rcwh' );
				if ( false === $file_lines ) {
					UrvanovSyntaxHighlighterLog::syslog( "Parsing of '{$element->path()}' failed, an (?alt) tag failed for the element '{$element->name()}'" );
					return false;
				}
				$file_lines = implode( '|', $file_lines );
				// If any spaces exist, treat them as whitespace.
				$file_lines = preg_replace( '#[ \t]+#msi', '\s+', $file_lines );
				$regex      = str_replace( $file[0][ $i ], "(?:$file_lines)", $regex );
			}
		}

		// If the (?default:element) function is used, replace the regex with the default, if exists.
		$def = self::regex_match( '#\(\?default(?:\:(\w+))?\)#', $regex );
		if ( 2 === count( $def ) ) {
			// Load default language.
			$default = Urvanov_Syntax_Highlighter_Resources::langs()->get( Urvanov_Syntax_Highlighter_Langs::DEFAULT_LANG );
			// If default has not been loaded, we can't use it, skip the element.
			if ( ! $default ) {
				UrvanovSyntaxHighlighterLog::syslog( "Could not use default regex in the element '{$element->name()}' in '{$element->path()}'" );
				return false;
			}
			$count = count( $def[1] );
			for ( $i = 0; $i < $count; $i++ ) {
				// Use the named element if one was provided, otherwise this element's own name.
				$element_name    = ( ! empty( $def[1][ $i ] ) ) ? $def[1][ $i ] : $element->name();
				$default_element = $default->element( $element_name );
				if ( false === $default_element ) {
					UrvanovSyntaxHighlighterLog::syslog( "The language at '{$element->path()}' referred to the Default Language regex for element '{$element->name()}', which did not exist." );
					if ( URVANOV_SYNTAX_HIGHLIGHTER_DEBUG ) {
						UrvanovSyntaxHighlighterLog::syslog( 'Default language URL: ' . Urvanov_Syntax_Highlighter_Resources::langs()->url( Urvanov_Syntax_Highlighter_Langs::DEFAULT_LANG ) );
						UrvanovSyntaxHighlighterLog::syslog( 'Default language Path: ' . Urvanov_Syntax_Highlighter_Resources::langs()->path( Urvanov_Syntax_Highlighter_Langs::DEFAULT_LANG ) );
					}
					return false;
				}
				$regex = str_replace( $def[0][ $i ], '(?:' . $default_element->regex() . ')', $regex );
			}
		}

		// If the (?html) tag is used, escape characters in html (<, > and &).
		$html = self::regex_match( '#\(\?html:(.+?)\)#', $regex );
		if ( 2 === count( $html ) ) {
			$count = count( $html[1] );
			for ( $i = 0; $i < $count; $i++ ) {
				$regex = str_replace( $html[0][ $i ], htmlentities( $html[1][ $i ] ), $regex );
			}
		}

		// Ensure all parenthesis are atomic to avoid conflicting with element matches.
		$regex = UrvanovSyntaxHighlighterUtil::esc_atomic( $regex );
		// Escape #, this is our delimiter.
		$regex = UrvanovSyntaxHighlighterUtil::esc_hash( $regex );

		// Test if regex is valid: preg_match() returns false when the pattern cannot be used.
		if ( false === preg_match( "#$regex#", '' ) ) {
			UrvanovSyntaxHighlighterLog::syslog( "The regex for the element '{$element->name()}' in '{$element->path()}' is not valid." );
			return false;
		}

		return $regex;
	}

	/**
	 * Normalises a list of CSS class names.
	 *
	 * Removes dots, lower-cases the text and joins the class names with single spaces.
	 * Names may be separated by any run of spaces or tabs, matching what the element
	 * pattern in parse() accepts between the square brackets.
	 *
	 * @param string $css CSS class names, e.g. ".Foo  .bar".
	 *
	 * @return string Space separated class names, e.g. "foo bar".
	 */
	public static function validate_css( string $css ): string {
		// Remove dots in CSS class and convert to lowercase.
		$css = strtolower( str_replace( '.', '', $css ) );

		// PREG_SPLIT_NO_EMPTY drops the empty pieces produced by leading/trailing separators.
		$classes = preg_split( '/[ \t]+/', $css, -1, PREG_SPLIT_NO_EMPTY );

		return false === $classes ? '' : implode( ' ', $classes );
	}

	/**
	 * Runs preg_match_all() and returns its matches.
	 *
	 * @param string $pattern RegEx, including delimiters.
	 * @param string $subject Subject.
	 *
	 * @return array The matches in pattern order (index 0 full matches, 1.. captured groups),
	 *               or an empty array when nothing matched or matching failed.
	 */
	public static function regex_match( string $pattern, string $subject ) {
		if ( preg_match_all( $pattern, $subject, $matches ) ) {
			return $matches;
		}

		return array();
	}

	/**
	 * Returns the known modes.
	 *
	 * @return bool[] Map of mode name to true.
	 */
	public static function modes(): array {
		return self::$modes;
	}

	/**
	 * Tells whether the given name is a known mode. The comparison is case sensitive.
	 *
	 * @param string $name Name.
	 *
	 * @return bool
	 */
	public static function is_mode( string $name ): bool {
		return array_key_exists( $name, self::$modes );
	}
}