forked from koreader/koreader-base
-
Notifications
You must be signed in to change notification settings - Fork 0
/
xtext.cpp
2374 lines (2186 loc) · 109 KB
/
xtext.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// xtext.cpp
// Lua interface to wrap a utf8 string into a XText object
// that provides various text shaping and layout methods
// with the help of Fribidi, Harfbuzz and libunibreak.
// We do many things similarly to how they are done in crengine,
// and took and adapted much code from it.
// For many links and notes about the concepts and libraries used,
// see: https://github.com/koreader/crengine/issues/307
extern "C"
{
#include <lua.h>
#include <lauxlib.h>
#include <lualib.h>
#include "xtext.h"
}
// Harfbuzz
#include <hb.h>
#include <hb-ft.h>
// FriBiDi
#include <fribidi/fribidi.h>
// libunibreak
#include <wordbreak.h>
#include <linebreak.h>
// linebreakdef.h is not wrapped by this, unlike linebreak.h
// (not wrapping it results in "undefined symbol" with the
// original function name kinda obfuscated)
#ifdef __cplusplus
extern "C" {
#endif
#include <linebreakdef.h>
#ifdef __cplusplus
}
#endif
// luajit doesn't provide lua_set/getuservalue (unless compiled
// with LUAJIT_ENABLE_LUA52COMPAT) but these are equivalents
// for our purpose
#if LUA_VERSION_NUM < 502
#define lua_setuservalue lua_setfenv
#define lua_getuservalue lua_getfenv
#endif
// Some names, as they should be known to Lua
#define XTEXT_LIBNAME "xtext"
#define XTEXT_METATABLE_NAME "luaL_XText"
#define XTEXT_HB_FONT_DATA_METATABLE_NAME "luaL_XText_HB_Font_Data"
#define XTEXT_LUA_HB_FONT_DATA_TABLE_KEY_NAME "_hb_font_data"
#define XTEXT_LUA_FONT_GETFONT_CALLBACK_NAME "getFallbackFont"
// Max unicode chars per shaped (visual) line
#define MAX_LINE_CHARS 4096
// Max returned glyphs per line (usually less glyphs than chars,
// but allow for more just in case some fonts don't come with many
// glyphs and combine many diacritics to form a unicode char).
// (Parenthesized so the macro stays a single operand wherever it is
// expanded, e.g. in "x % MAX_LINE_GLYPHS" or "n / MAX_LINE_GLYPHS".)
#define MAX_LINE_GLYPHS (2*MAX_LINE_CHARS)
// Max number of fonts (main + fallbacks)
// (main + 15 fallback fonts should be enough)
#define MAX_FONT_NUM 16
// Sentinel for "no width available" (needs INT_MIN from <limits.h>/<climits>)
#define NOT_MEASURED INT_MIN
#define REPLACEMENT_CHAR 0xFFFD
#define ELLIPSIS_CHAR 0x2026
// Helpers with font metrics (units are 1/64 px)
// #define FONT_METRIC_FLOOR(x) ((x) & -64)
// #define FONT_METRIC_CEIL(x) (((x)+63) & -64)
// #define FONT_METRIC_ROUND(x) (((x)+32) & -64)
// #define FONT_METRIC_TRUNC(x) ((x) >> 6)
#define FONT_METRIC_TO_PX(x) (((x)+32) >> 6) // ROUND + TRUNC
// Uncomment for debugging text measurement and line shaping:
// #define DEBUG_MEASURE_TEXT
// #define DEBUG_SHAPE_LINE
// ==============================================
// Utility functions
// Cheap pre-filter telling whether codepoint 'c' could be a RTL char,
// so callers can skip FriBiDi entirely for text that has none.
// Looking at fribidi/lib/bidi-type.tab.i and its rules for tagging
// a char as RTL, only the following ranges will trigger it:
//   0590>08FF     Hebrew, Arabic, Syriac, Thaana, Nko, Samaritan...
//   200F 202B     Right-To-Left mark/embedding control chars
//   202E 2067     Right-To-Left override/isolate control chars
//   FB1D>FDFF     Hebrew and Arabic presentation forms
//   FE70>FEFF     Arabic presentation forms
//   10800>10FFF   Other rare scripts possibly RTL
//   1E800>1EEBB   Other rare scripts possibly RTL
// False positives are fine (there may be LTR chars in these ranges):
// fribidi gets invoked and just reports there is no bidi.
inline bool is_unicodepoint_rtl(uint32_t c) {
    // The checks walk the ranges in ascending order, each one relying on
    // the previous ones having already returned for smaller codepoints.
    if ( c < 0x0590 )       // Latin, Greek, Cyrillic...: the most common case
        return false;
    if ( c <= 0x08FF )      // 0590>08FF
        return true;
    if ( c <= 0x2067 )      // 0900>2067: only four isolated control chars
        return c == 0x200F || c == 0x202B || c == 0x202E || c == 0x2067;
    if ( c < 0xFB1D )       // 2068>FB1C
        return false;
    if ( c <= 0xFDFF )      // FB1D>FDFF
        return true;
    if ( c <= 0xFEFF )      // FE00>FEFF: only the upper part
        return c >= 0xFE70;
    if ( c > 0x1EEBB )      // past the last range
        return false;
    if ( c >= 0x1E800 )     // 1E800>1EEBB
        return true;
    return c >= 0x10800 && c <= 0x10FFF;
}
// Fribidi provides fribidi_charset_to_unicode(FRIBIDI_CHAR_SET_UTF8,...)
// but it expects valid utf8, and we want to support broken UTF-8 and WTF-8.
// So we implement Utf8ToUnicode(), which can be called twice:
// - once with dst=NULL, to quickly count the number of Unicode chars,
// - then with a non-null dst (malloc'ed to the previously obtained size)
// to decode and fill it with the Unicode chars.
// adapted from crengine/src/lvstring.cpp
// Byte-level helpers; they expect the read cursor to be named 's' and the
// end-of-input pointer to be named 'ends' at the place they are expanded.
#define HEAD_CHECK(mask, expect) ((s[0] & mask) == expect)
#define HEAD_BYTE(mask, shift) (((uint32_t)(s[0]) & mask) << shift)
#define CONT_BYTE(index, shift) (((uint32_t)(s[index]) & 0x3F) << shift)
#define HAS_FOLLOWUP(n) (s+n < ends)
#define IS_FOLLOWING(index) ((s[index] & 0xC0) == 0x80)
// Lenient UTF-8 / WTF-8 decoder.
//   src, srclen : input bytes (no NUL terminator needed)
//   dst, dstlen : output array and its capacity in codepoints; when dst is NULL
//                 nothing is written and the codepoints are only counted
//   is_valid    : (out) set to false as soon as a malformed sequence is met
//   has_rtl     : (in/out) only ever raised, and only when decoding; a caller
//                 passing true skips the per-codepoint RTL check
// Returns the number of codepoints counted or written. Each byte that can't
// start or complete a sequence yields one REPLACEMENT_CHAR and is skipped alone.
// When decoding, a high+low surrogate pair encoded as two 3-bytes sequences is
// merged into a single codepoint, so the decoding pass may return less than the
// counting pass did.
int Utf8ToUnicode(const char * src, int srclen, uint32_t * dst, int dstlen, bool &is_valid, bool &has_rtl)
{
    const bool decoding = ( dst != NULL );  // otherwise, only count
    uint32_t * out = decoding ? dst : NULL;
    uint32_t * const out_end = decoding ? dst + dstlen : NULL;
    const char * s = src;
    const char * ends = src + srclen;
    int count = 0;                           // nb of unicode chars found

    is_valid = true; // until invalid found
    // has_rtl is deliberately not reset: we trust the provided value.

    while ( s < ends ) {
        // safety check: never write past what the caller allocated
        if ( decoding && out >= out_end )
            break;

        uint32_t cp = REPLACEMENT_CHAR;
        bool well_formed = false;

        if ( HEAD_CHECK(0x80, 0) ) {
            // 1 byte: plain ASCII
            cp = (uint32_t)(*s);
            s += 1;
            well_formed = true;
        }
        else if ( HEAD_CHECK(0xE0, 0xC0) ) {
            // 2 bytes
            if ( HAS_FOLLOWUP(1) && IS_FOLLOWING(1) ) {
                cp = HEAD_BYTE(0x1F, 6) | CONT_BYTE(1,0);
                s += 2;
                well_formed = true;
            }
        }
        else if ( HEAD_CHECK(0xF0, 0xE0) ) {
            // 3 bytes
            if ( HAS_FOLLOWUP(2) && IS_FOLLOWING(1) && IS_FOLLOWING(2) ) {
                cp = HEAD_BYTE(0x0F, 12) | CONT_BYTE(1,6) | CONT_BYTE(2,0);
                s += 3;
                well_formed = true;
                // WTF-8 (https://en.wikipedia.org/wiki/UTF-8#WTF-8) is a superset of
                // UTF-8 that allows UTF-16 surrogates encoded as UTF-8 bytes, which
                // well-formed UTF-8 forbids. Also see:
                //   https://unicodebook.readthedocs.io/issues.html#non-strict-utf-8-decoder-overlong-byte-sequences-and-surrogates
                //   https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs
                // We may get them from JSON encoded strings when the JSON decoder did
                // not decode them correctly (JSON can't encode high codepoints
                // directly, and uses such surrogates for them).
                // This is only looked for when decoding, not when counting (so we
                // may end up with a string a bit smaller than what was allocated).
                // Note: 's' already points past the high surrogate here, so the
                // macros below inspect the sequence that follows it.
                const bool maybe_pair = decoding
                                     && cp >= 0xD800 && cp <= 0xDBFF
                                     && HAS_FOLLOWUP(2)
                                     && HEAD_CHECK(0xF0, 0xE0) && IS_FOLLOWING(1) && IS_FOLLOWING(2);
                if ( maybe_pair ) {
                    const uint32_t low = HEAD_BYTE(0x0F, 12) | CONT_BYTE(1,6) | CONT_BYTE(2,0);
                    if ( low >= 0xDC00 && low <= 0xDFFF ) {
                        // Valid high+low sequence: replace by the codepoint it stands for
                        cp = 0x10000 + ((cp & 0x3FF)<<10) + (low & 0x3FF);
                        s += 3;
                    }
                }
                // todo: deal with invalid surrogate sequences
            }
        }
        else if ( HEAD_CHECK(0xF8, 0xF0) ) {
            // 4 bytes
            if ( HAS_FOLLOWUP(3) && IS_FOLLOWING(1) && IS_FOLLOWING(2) && IS_FOLLOWING(3) ) {
                cp = HEAD_BYTE(0x07, 18) | CONT_BYTE(1,12) | CONT_BYTE(2,6) | CONT_BYTE(3,0);
                s += 4;
                well_formed = true;
            }
        }
        // else: invalid first byte in UTF-8 sequence

        if ( !well_formed ) {
            // Skip that single byte, it will show as a replacement char
            cp = REPLACEMENT_CHAR;
            s += 1;
            is_valid = false;
        }

        if ( decoding ) {
            *out = cp;
            // Try to detect if we have RTL chars, so that if we don't have any,
            // we don't need to invoke expensive fribidi processing.
            if ( !has_rtl )
                has_rtl = is_unicodepoint_rtl(cp);
            out++;
        }
        count++;
    }
    return count;
}
// ==============================================
// Flags, data structures, and global variables
// HINT_*: bitmask given as the 'hints' argument of measureSegment(), which maps
// them to the HarfBuzz buffer direction and its BOT/EOT flags.
#define HINT_DIRECTION_IS_RTL 0x0001 /// segment direction is RTL
#define HINT_BEGINS_PARAGRAPH 0x0002 /// segment is at start of paragraph
#define HINT_ENDS_PARAGRAPH 0x0004 /// segment is at end of paragraph
// CHAR_*: bitmask kept for each char in xtext_charinfo_t.flags.
// Wrapping flags, set by measure() from what libunibreak says about the break
// opportunity between a char and the one following it (tabs are special-cased):
#define CHAR_CAN_WRAP_AFTER 0x0001
#define CHAR_MUST_BREAK_AFTER 0x0002
#define CHAR_SKIP_ON_BREAK 0x0004
#define CHAR_CAN_EXTEND_WIDTH 0x0008
#define CHAR_CAN_EXTEND_WIDTH_FALLBACK 0x0010 // Fallback if no space: extend CJK chars
// These two are not set by any of the code visible in this part of the file
// (presumably by measureSegment() - to be confirmed):
#define CHAR_IS_CLUSTER_TAIL 0x0020
#define CHAR_IS_RTL 0x0040
// Set by measure() on the first char whose unicode script differs from the
// previous (non common/inherited/unknown) one:
#define CHAR_SCRIPT_CHANGE 0x0080
// Set by measure() on the first/last char of each paragraph:
#define CHAR_IS_PARA_START 0x0100
#define CHAR_IS_PARA_END 0x0200
// Set by checkBidi() on all chars of a paragraph that FriBiDi resolved as RTL:
#define CHAR_PARA_IS_RTL 0x0400 /// to know the line with this char is part
/// of a paragraph with main dir RTL
// (0x0800 is not assigned in this list)
#define CHAR_IS_TAB 0x1000 /// char is '\t'
// Info, after measure(), about each m_text char
// (One entry per codepoint of m_text, calloc'ed by XText::allocate().)
typedef struct {
unsigned short flags; // bitmask of the CHAR_* flags
signed short width; // individual width of this char (not a cumulative one), to fit in 16 bits
} xtext_charinfo_t;
// Glyph info when shaping a line (to be returned to Lua as a table of tables)
// (16 bytes, making our static s_shape_result[MAX_LINE_GLYPHS] a 128Kb buffer)
// NOTE(review): the code filling this (shapeLine()) is not visible in this part
// of the file. The is_rtl/can_extend/can_extend_fallback/is_tab bits presumably
// mirror the CHAR_* flags of the same name - to be confirmed.
// Field order and bitfield widths determine the struct size mentioned above:
// don't reorder or widen them casually.
typedef struct {
int text_index; // original index in m_text
uint16_t glyph; // glyph index in font
unsigned char font_num; // presumably the font number as given to getHbFontData() - confirm
unsigned is_rtl:1;
unsigned can_extend:1;
unsigned can_extend_fallback:1;
unsigned is_tab:1;
unsigned _unused:4; // padding, completes the 4 flags above to 8 bits
signed short x_advance;
signed short x_offset;
signed short y_offset;
unsigned char cluster_len;
unsigned is_cluster_start:1;
} xtext_shapeinfo_t;
// Holder of HB data structures per font, to be stored as a userdata
// in the Lua font table
// (Created and filled by XText::getHbFontData(), which stores the userdata
// under the '_hb_font_data' key of the Lua font table.)
typedef struct {
hb_font_t * hb_font; // from hb_ft_font_create_referenced() on the font's FT_Face
hb_buffer_t * hb_buffer; // from hb_buffer_create(), cleared and reused by each measureSegment()
hb_feature_t * hb_features; // realloc'ed array parsed from the Lua font 'hb_features' table, or NULL
int hb_features_nb; // number of items in hb_features
} xtext_hb_font_data;
// Global direction and language
static bool default_para_direction_rtl = false;
static char * default_lang = NULL;
static hb_language_t default_lang_hb_language = HB_LANGUAGE_INVALID;
// ==============================================
// Our main class
// (We would have liked to have it pure C++, but we do use and push
// things to the Lua stack, to avoid some indirection and overhead).
class XText {
private:
// Shared by all XText instances. Should not be used
// across calls to shapeLine()
static xtext_shapeinfo_t s_shape_result[MAX_LINE_GLYPHS];
static bool s_libunibreak_init_done;
public:
lua_State * m_L; // updated by each Lua method proxy
int m_length; // nb of unicode codepoints
bool m_no_longer_usable; // to prevent using it between dealloc & Lua gc
bool m_is_valid; // input was valid UTF-8
bool m_is_measured;
bool m_para_direction_rtl; // paragraph direction
bool m_auto_para_direction; // auto-detect paragraph direction
bool m_has_rtl;
bool m_has_bidi;
bool m_has_multiple_scripts; // true when multiple unicode scripts detected
char * m_lang;
hb_language_t m_hb_language;
int m_width; // measured full width
uint32_t * m_text; // array of unicode chars
xtext_charinfo_t * m_charinfo; // info about each of these unicode chars
FriBidiCharType * m_bidi_ctypes; // FriBiDi internal helper structures
FriBidiBracketType * m_bidi_btypes;
FriBidiLevel * m_bidi_levels;
// Builds an empty, not yet measured XText: no text, no language, LTR paragraph
// direction, and none of the per-char arrays allocated.
XText()
    : m_L(NULL)
    , m_length(0)
    // state flags
    , m_no_longer_usable(false)
    , m_is_valid(false)
    , m_is_measured(false)
    // direction, bidi and scripts
    , m_para_direction_rtl(false)
    , m_auto_para_direction(false)
    , m_has_rtl(false)
    , m_has_bidi(false)
    , m_has_multiple_scripts(false)
    // language
    , m_lang(NULL)
    , m_hb_language(HB_LANGUAGE_INVALID)
    // measurement result
    , m_width(NOT_MEASURED)
    // buffers, allocated later by setTextFrom*() and allocate()
    , m_text(NULL)
    , m_charinfo(NULL)
    , m_bidi_ctypes(NULL)
    , m_bidi_btypes(NULL)
    , m_bidi_levels(NULL)
{
    // Debug helpers:
    //   printf("XText created\n");
    //   printf("%ld\n", sizeof(xtext_shapeinfo_t));
}
// Releases everything this instance owns (all the work is done by
// deallocate(), which is safe to have been called before).
~XText()
{
    this->deallocate();
    // printf("XText destroyed\n");
}
void allocate() {
m_charinfo = (xtext_charinfo_t *)calloc(m_length, sizeof(*m_charinfo)); // set all flags to 0
if ( m_has_rtl ) {
m_bidi_ctypes = (FriBidiCharType *)malloc(m_length * sizeof(*m_bidi_ctypes));
m_bidi_btypes = (FriBidiBracketType *)malloc(m_length * sizeof(*m_bidi_btypes));
m_bidi_levels = (FriBidiLevel *)malloc(m_length * sizeof(*m_bidi_levels));
}
}
// Frees all the buffers owned by this instance and flags it as no longer
// usable. Pointers are reset to NULL, so calling this more than once (as
// the destructor does) is harmless.
void deallocate() {
    // free(NULL) and delete[] NULL are no-ops: no need to test the pointers
    free(m_text);
    free(m_charinfo);
    free(m_bidi_ctypes);
    free(m_bidi_btypes);
    free(m_bidi_levels);
    delete[] m_lang;

    m_text = NULL;
    m_charinfo = NULL;
    m_bidi_ctypes = NULL;
    m_bidi_btypes = NULL;
    m_bidi_levels = NULL;
    m_lang = NULL;

    m_no_longer_usable = true;
}
void setLanguage(const char * lang) {
m_lang = new char[strlen(lang)+1];
strcpy(m_lang, lang);
m_hb_language = hb_language_from_string(m_lang, -1);
}
// Get UTF-32 m_text from the provided UTF-8
void setTextFromUTF8String(const char * utf8_text, int utf8_len) {
    // Sets m_text, m_length, m_is_valid and m_has_rtl from the utf8_len bytes
    // at utf8_text. If m_text can't be allocated, we end up as an empty text
    // (m_length = 0, m_text = NULL) rather than with a length and no buffer.

    // Don't leak a previous text if we happen to be called again
    free(m_text);
    m_text = NULL;

    // We call Utf8ToUnicode() twice: a 1st phase to quickly
    // count the number of unicode codepoints, before allocating m_text,
    // and a 2nd to actually do the conversion and fill m_text.
    m_length = Utf8ToUnicode(utf8_text, utf8_len, NULL, 0, m_is_valid, m_has_rtl);
    m_text = (uint32_t *)malloc(m_length * sizeof(*m_text));

    // m_has_rtl is only detected in the 2nd phase.
    // If m_para_direction_rtl is true, set m_has_rtl=true in all case
    // to force checkBidi(), and avoid some work in Utf8ToUnicode().
    m_has_rtl = m_para_direction_rtl;

    if ( !m_text ) {
        // Allocation failed (or nothing to allocate, with a malloc(0) that
        // returns NULL): a 2nd call would only count again, and leave us with
        // m_length > 0 and no buffer behind it.
        m_length = 0;
        return;
    }
    // (The length may get a bit smaller here, as WTF-8 surrogate pairs are
    // only merged when decoding.)
    m_length = Utf8ToUnicode(utf8_text, utf8_len, m_text, m_length, m_is_valid, m_has_rtl);
}
// Get UTF-32 m_text from a Lua array of individual UTF-8 strings,
// as made by frontend/util.lua util.splitToChars(text) and
// hold as InputType.charlist, which is given to TextBoxWidget.
// We need this because:
// There are multiple ways to handle invalid UTF-8 (like WTF-8,
// and whether 1 replacement char per invalid byte or per sequence
// of invalid bytes).
// Our setTextFromUTF8String() may not always give a m_text
// equivalent to InputType.charlist - but we need them to be sync'ed
// for correct cursor positioning and text insertion/deletion.
// So, we allow XText to handle such input: this avoid having to sync
// both utf8 decoding algorithms (but we can aim later at having
// a single good one).
void setTextFromUTF8CharsLuaArray(lua_State * L, int n) {
    // 'n' is the stack index of the Lua array; each of its items must be a string
    // holding the UTF-8 bytes of a single char (luaL_checklstring() raises a Lua
    // error otherwise). Items are decoded from their byte length only, without
    // validating the bytes; any length other than 1 to 4 gives a REPLACEMENT_CHAR.

    // Don't leak a previous text if we happen to be called again
    free(m_text);
    m_text = NULL;

    m_length = (int) lua_objlen(L, n); // NOTE: size_t -> int, as that's what both FriBidi & HarfBuzz expect.
    m_text = (uint32_t *)malloc(m_length * sizeof(*m_text));
    m_is_valid = true; // assume it is valid if coming from Lua array
    // If m_para_direction_rtl is true, set m_has_rtl=true in all case
    // to force checkBidi(), and avoid is_unicodepoint_rtl() check below.
    m_has_rtl = m_para_direction_rtl;

    if ( !m_text ) {
        // Allocation failed (or empty array with a malloc(0) returning NULL):
        // be an empty text rather than writing through a NULL pointer below.
        m_length = 0;
        return;
    }

    for (int i = 0; i < m_length; i++) {
        lua_rawgeti(L, n, i+1); // (Lua indices start at 1)
        size_t len;
        const unsigned char * s = (const unsigned char*) luaL_checklstring(L, -1, &len);
        // Should be similar to base/util.lua util.utf8charcode(charstring)
        uint32_t u;
        if (len == 1) {
            u = s[0] & 0x7F;
        }
        else if (len == 2) {
            u = ((s[0] & 0x1F)<<6) + (s[1] & 0x3F);
        }
        else if (len == 3) {
            u = ((s[0] & 0x0F)<<12) + ((s[1] & 0x3F)<<6) + (s[2] & 0x3F);
        }
        else if (len == 4) {
            u = ((s[0] & 0x07)<<18) + ((s[1] & 0x3F)<<12) + ((s[2] & 0x3F)<<6) + (s[3] & 0x3F);
        }
        else {
            u = REPLACEMENT_CHAR;
        }
        // Only pop the item once we are done reading its bytes: Lua does not
        // guarantee the pointer stays valid after the value has left the stack
        // (think of a number item, converted to a string on the stack only).
        lua_pop(L, 1); // clean stack
        m_text[i] = u;
        if ( !m_has_rtl && is_unicodepoint_rtl(u) )
            m_has_rtl = true;
    }
}
void checkBidi() {
if ( !m_has_rtl ) // No need for expensive bidi work
return;
FriBidiParType specified_para_bidi_type;
if ( m_auto_para_direction) {
if ( m_para_direction_rtl )
specified_para_bidi_type = FRIBIDI_PAR_WRTL; // Weak RTL
else
specified_para_bidi_type = FRIBIDI_PAR_WLTR; // Weak LTR
}
else {
if ( m_para_direction_rtl )
specified_para_bidi_type = FRIBIDI_PAR_RTL; // Strong RTL
else
specified_para_bidi_type = FRIBIDI_PAR_LTR; // Strong LTR
}
// Compute bidi levels
fribidi_get_bidi_types((const FriBidiChar*)m_text, m_length, m_bidi_ctypes);
fribidi_get_bracket_types((const FriBidiChar*)m_text, m_length, m_bidi_ctypes, m_bidi_btypes);
// We would have simply done:
// int max_level = fribidi_get_par_embedding_levels_ex(m_bidi_ctypes, m_bidi_btypes,
// m_length, (FriBidiParType*)&m_para_bidi_type, m_bidi_levels);
// But unfortunately, fribidi_get_par_embedding_levels_ex() only works on a single
// paragraph, and will set bogus levels for the text following the
// first \n (or other Unicode Block Separators, BS).
// FriBiDi expects us to work only on individual paragraphs. But we
// still want to process the whole text here so that we're done with it.
// So, split on BS and call fribidi_get_par_embedding_levels_ex() on
// each segment - hoping doing it that way is OK...
int max_level = 0;
int s_start = 0;
int i = 0;
while ( i <= m_length ) {
if ( i == m_length || m_bidi_ctypes[i] == FRIBIDI_TYPE_BS ) {
int s_length = i - s_start;
if (i < m_length)
s_length += 1; // include BS at i in segment
FriBidiParType para_bidi_type = specified_para_bidi_type;
FriBidiCharType * bidi_ctypes = (FriBidiCharType *) (m_bidi_ctypes + s_start);
FriBidiBracketType * bidi_btypes = (FriBidiBracketType *)(m_bidi_btypes + s_start);
FriBidiLevel * bidi_levels = (FriBidiLevel *) (m_bidi_levels + s_start);
int this_max_level = fribidi_get_par_embedding_levels_ex(bidi_ctypes, bidi_btypes,
s_length, ¶_bidi_type, bidi_levels);
/* To see resulting bidi levels:
printf("par_type %d , max_level %d\n", para_bidi_type, this_max_level);
for (int j=s_start; j<i; j++)
printf("%x %c %d\n", m_text[j], m_text[j], m_bidi_levels[j]);
*/
if ( this_max_level > max_level )
max_level = this_max_level;
// we set a flag on all chars part of this segment so we can know what
// is the paragraph direction of the paragraph this char is in.
if ( para_bidi_type == FRIBIDI_PAR_RTL || para_bidi_type == FRIBIDI_PAR_WRTL ) {
for ( int j=s_start; j<i; j++ ) {
m_charinfo[j].flags |= CHAR_PARA_IS_RTL;
}
// Also set it on the \n/FRIBIDI_TYPE_BS char
if (i < m_length) {
m_charinfo[i].flags |= CHAR_PARA_IS_RTL;
}
}
s_start = i+1;
}
i++;
}
// If computed max level == 1, we are in plain and only LTR,
// so no need for more bidi work later.
if ( max_level > 1 )
m_has_bidi = true;
}
// Get HB font data structures for font #num (create them and store them in the
// Lua font object, or get the previously created and stored ones)
// This must be a method of our XText object, as it uses the uservalue that has
// been associated with the userdata that is wrapping this XText instance.
xtext_hb_font_data * getHbFontData(int num) {
    // Returns the HarfBuzz data of font #num, or NULL when there is no such font
    // (the font.getFallbackFont(num) Lua callback did not return a table).
    // The returned pointer is a Lua userdata stored in (and so kept alive by) the
    // Lua font table: it is not to be freed by the caller.
    // Raises a Lua error if the font table has no usable 'ftface'.
    if ( num > MAX_FONT_NUM )
        return NULL;
    // This uses the stack for C <-> Lua interaction, but we should put this
    // stack back in its original state, as it may carry additional arguments
    // to the original function that was called.
    int stack_orig_top = lua_gettop(m_L);
    // The uservalue (the Lua font face_obj table) has been put at 1 on the stack
    // by check_XText().
    // Get the Lua font table for fallback font #num, by calling
    // the Lua callback function: font.getFallbackFont(num).
    lua_getfield(m_L, 1, XTEXT_LUA_FONT_GETFONT_CALLBACK_NAME);
    lua_pushinteger(m_L, num);
    int call_status = lua_pcall(m_L, 1, 1, 0); // 1 argument, 1 returned value
    // On error, what we got on the stack is the error message
    if ( call_status != 0 || !lua_istable(m_L, -1) ) { // No #num font (we got "false")
        lua_settop(m_L, stack_orig_top); // restore stack / drop our added work stuff
        return NULL;
    }
    // We have a font, we'll be able to return something.
    xtext_hb_font_data * hb_data;
    // We got our font table. See if we already have the hb stuff stored
    // as a userdata under the key '_hb_font_data'
    lua_getfield(m_L, -1, XTEXT_LUA_HB_FONT_DATA_TABLE_KEY_NAME);
    if ( lua_isuserdata(m_L, -1) ) {
        // We do: just return the pointer to it (that we stored as the userdata)
        hb_data = (xtext_hb_font_data *)luaL_checkudata(m_L, -1, XTEXT_HB_FONT_DATA_METATABLE_NAME);
        lua_settop(m_L, stack_orig_top); // restore stack / drop our added work stuff
        return hb_data;
    }
    lua_pop(m_L, 1); // remove nil
    // Not previously stored: we have to create it and store it
    // Get the 'ftface' Freetype FFI wrapped object
    lua_getfield(m_L, -1, "ftface");
    // printf("face type: %d %s\n", lua_type(m_L, -1), lua_typename(m_L, lua_type(m_L, -1)));
    // We expect it to be a luajit ffi cdata, but the C API does not have a #define for
    // that type. But it looks like its value is higher than the greatest LUA_T* type.
    if ( lua_type(m_L, -1) <= LUA_TTHREAD ) { // Higher plain Lua datatype (lua.h)
        luaL_typerror(m_L, -1, "cdata");
    }
    // Get the usable (for Harfbuzz) FT_Face object
    FT_Face * face = (FT_Face *)lua_topointer(m_L, -1);
    lua_pop(m_L, 1); // remove ftface object
    // Create a Lua userdata that will keep the reference to our hb_data
    // (alloc/free of this userdata is managed by Lua, but not the cleanup
    // of the Harfbuzz stuff allocated and stored in it. So, we have set
    // to its metatable a __gc function, so it is called when the userdata
    // is gc()'ed by Lua, so we can free these Harfbuzz structures).
    hb_data = (xtext_hb_font_data *)lua_newuserdata(m_L, sizeof(xtext_hb_font_data));
    // Lua does not zero-fill userdata: have it in a known state before it
    // becomes reachable from Lua (and gets a __gc).
    hb_data->hb_font = NULL;
    hb_data->hb_buffer = NULL;
    hb_data->hb_features = NULL;
    hb_data->hb_features_nb = 0;
    luaL_getmetatable(m_L, XTEXT_HB_FONT_DATA_METATABLE_NAME);
    lua_setmetatable(m_L, -2);
    // Set this userdata as the '_hb_font_data' key of our Lua font table
    lua_setfield(m_L, -2, XTEXT_LUA_HB_FONT_DATA_TABLE_KEY_NAME);
    hb_data->hb_font = hb_ft_font_create_referenced(*face);
    // These flags should be sync'ed with freetype.lua FT_Load_Glyph_flags:
    //   hb_ft_font_set_load_flags(hb_data->hb_font, FT_LOAD_TARGET_LIGHT | FT_LOAD_FORCE_AUTOHINT);
    // No hinting, as it would mess synthetized bold.
    hb_ft_font_set_load_flags(hb_data->hb_font, FT_LOAD_TARGET_LIGHT | FT_LOAD_NO_AUTOHINT | FT_LOAD_NO_HINTING);
    hb_data->hb_buffer = hb_buffer_create();
    // We can set what OTF features to use from Lua
    lua_getfield(m_L, -1, "hb_features");
    if ( lua_istable(m_L, -1) ) {
        lua_pushnil(m_L); /* first key */
        while ( lua_next(m_L, -2) != 0 ) {
            if ( lua_isstring(m_L, -1) ) {
                size_t len;
                const char * feature = lua_tolstring(m_L, -1, &len);
                // printf("hbfont feature: %s\n", feature);
                hb_feature_t f;
                if ( hb_feature_from_string(feature, len, &f) ) {
                    // Grow through a temporary pointer, and only count the feature
                    // once it is stored: if realloc() fails, the features we already
                    // have stay valid (and owned), and hb_features_nb never gets
                    // larger than the array.
                    int nb = hb_data->hb_features_nb;
                    hb_feature_t * grown = (hb_feature_t*)realloc( hb_data->hb_features,
                                                        (nb + 1) * sizeof(hb_feature_t) );
                    if ( grown ) {
                        grown[nb] = f;
                        hb_data->hb_features = grown;
                        hb_data->hb_features_nb = nb + 1;
                    }
                }
            }
            lua_pop(m_L, 1); // remove fetched value, but keep key for next iteration
        }
    }
    // printf("hbfont #features: %d\n", hb_data->hb_features_nb);
    lua_settop(m_L, stack_orig_top); // restore stack / drop our added work stuff
    return hb_data;
}
// Single pass over m_text, done once (later calls return immediately), that:
// - allocates the per-char arrays and runs checkBidi()
// - flags each char in m_charinfo with what libunibreak says about wrapping
//   after it, and with script change and paragraph start/end markers
// - cuts the text into segments at each bidi level change, script change,
//   mandatory line break and at end of text, and has each of them measured
//   by measureSegment() with font #0
// - stores the sum of these segments widths in m_width.
void measure() {
if ( m_is_measured )
return;
if ( m_length == 0 ) {
// Nothing to allocate nor measure
m_width = 0;
m_is_measured = true;
return;
}
allocate();
checkBidi();
// libunibreak needs a one-time global init, shared by all XText instances
if ( !s_libunibreak_init_done ) {
s_libunibreak_init_done = true;
init_linebreak();
}
struct LineBreakContext lbCtx;
int final_width = 0;
int prev_para_start = 0; // index of the first char of the current paragraph
int start = 0; // start of segment to be measured
FriBidiLevel last_bidi_level = 0;
FriBidiLevel new_bidi_level = 0;
hb_unicode_funcs_t* unicode_funcs = hb_unicode_funcs_get_default();
hb_script_t prev_script = HB_SCRIPT_COMMON;
// We go up to i == m_length (one past the last char), so that the last
// segment gets closed and measured too.
for ( int i=0; i<=m_length; i++ ) {
bool end_of_text = i == m_length;
// Bidi handling
bool bidi_level_changed = false;
int last_direction = 1; // LTR if no bidi found
if ( m_has_bidi ) {
new_bidi_level = i < m_length ? m_bidi_levels[i] : last_bidi_level;
if ( i == 0 )
last_bidi_level = new_bidi_level;
else if ( new_bidi_level != last_bidi_level )
bidi_level_changed = true;
// The segment that may get closed below is the one made of the previous
// chars: its direction is the one of the previous level.
if ( FRIBIDI_LEVEL_IS_RTL(last_bidi_level) )
last_direction = -1; // RTL
}
// Text Unicode script change
// Arabic surrounded by hebrew chars would not get its letters joined
// if they were all shaped as a single segment. This may probably happen
// too with some complex LTR scripts like indic surrounded by latin.
// Note: libraqm and Lua library https://github.com/luapower/tr do
// a bit more than that by trying to make neutral paired characters part
// of a same script segment (_raqm_resolve_scripts(), using a stack, so
// probably costly and needing another pass). We don't do that for now.
bool script_changed = false;
if ( i < m_length ) {
hb_script_t script = hb_unicode_script(unicode_funcs, m_text[i]);
// Chars with no script of their own (common, inherited, unknown) never
// trigger a change, and are not remembered as the previous script.
if ( script != HB_SCRIPT_COMMON && script != HB_SCRIPT_INHERITED && script != HB_SCRIPT_UNKNOWN ) {
if ( prev_script != HB_SCRIPT_COMMON && script != prev_script ) {
m_charinfo[i].flags |= CHAR_SCRIPT_CHANGE;
script_changed = true;
m_has_multiple_scripts = true;
}
prev_script = script;
}
// Note: as we have here guessed the script of what's to be measured
// next, we could store it in m_charinfo (an additional int32...),
// so we can pass it to getHbFontData() (or use it and check
// ourselves here), to skip fallback fonts that do not support
// this script - if fonts announce the scripts they support (I think
// I have seen that in some font tables, may be OTF only?).
}
// Line breaking and wrapping
bool line_break = false;
if ( i == 0 ) {
lb_init_break_context(&lbCtx, m_text[i], m_lang ? m_lang : default_lang);
}
else {
// When at end of m_text, add a letter ('Z') so a trailing \n can be
// flagged as CHAR_MUST_BREAK_AFTER, so we can show an empty line
// and allow the cursor to be positioned after that last \n.
int ch = i < m_length ? m_text[i] : 'Z';
int brk = lb_process_next_char(&lbCtx, ch);
// This tells us about a break between previous char and this 'ch'.
// printf("between <%c%c>: brk %d\n", m_text[i-1], m_text[i], brk);
// Note: LINEBREAK_ALLOWBREAK is set on the last space in a sequence
// of multiple consecutive spaces.
// (Tabs are checked first: for them, 'brk' is ignored.)
if ( m_text[i-1] == '\t' ) {
// Previous note also applies to tabs: but allow break
// after any tab (so, between any consecutive tabs)
m_charinfo[i-1].flags |= CHAR_CAN_WRAP_AFTER;
m_charinfo[i-1].flags |= CHAR_SKIP_ON_BREAK; // skip when at end of line
m_charinfo[i-1].flags |= CHAR_CAN_EXTEND_WIDTH; // (frontend can ignore that)
m_charinfo[i-1].flags |= CHAR_IS_TAB;
}
else if ( brk == LINEBREAK_ALLOWBREAK ) {
// Happens between a space (at i-1) and its following non-space
// char, or after each CJK char.
m_charinfo[i-1].flags |= CHAR_CAN_WRAP_AFTER;
// We trust libunibreak to not set it on non-break spaces, but
// we have to manually check for spaces that we can skip on break
// and those with a not-fixed width that we can extend when justifying
// text. List of space chars at http://jkorpela.fi/chars/spaces.html
uint32_t pch = m_text[i-1];
if ( pch == ' ' || pch == 0x3000 || (pch >= 0x2000 && pch <= 0x200B) ){
m_charinfo[i-1].flags |= CHAR_SKIP_ON_BREAK; // skip when at end of line
if ( pch == ' ' ) { // others have a fixed width, and not for IDEOGRAPHIC SPACE
m_charinfo[i-1].flags |= CHAR_CAN_EXTEND_WIDTH; // for text justification
}
}
// In case there's no space (pure CJK line), and we want text
// justification, allow extending width of all allowbreak chars
// (we could check if pch is really a CJK one, but let's take
// this shortcut for now). This can be ignored in frontend if
// it looks ugly or is not wanted by CJK readers.
m_charinfo[i-1].flags |= CHAR_CAN_EXTEND_WIDTH_FALLBACK;
}
else if ( brk == LINEBREAK_MUSTBREAK ) {
// Happens between "\n" (at i-1) and its follow up char
m_charinfo[i-1].flags |= CHAR_MUST_BREAK_AFTER;
m_charinfo[i-1].flags |= CHAR_SKIP_ON_BREAK;
line_break = true;
}
else if ( m_text[i-1] == 0x00A0 ) { // regular no-break-space with a non-fixed width
m_charinfo[i-1].flags |= CHAR_CAN_EXTEND_WIDTH; // for text justification
}
}
// Close and measure the segment [start, i) if something ends it here
if ( i>start && (bidi_level_changed || script_changed || line_break || end_of_text) ) {
int hints = 0;
if ( start == prev_para_start ) {
hints |= HINT_BEGINS_PARAGRAPH;
// We set this fact in m_charinfo too, so it's available to shapeLine()
m_charinfo[start].flags |= CHAR_IS_PARA_START;
}
if ( line_break || i == m_length ) {
hints |= HINT_ENDS_PARAGRAPH;
// On a line break, the char at i-1 is the break char itself: flag the
// char before it when this segment has one, otherwise the char at i-1.
if ( line_break && i-2 >= start) {
m_charinfo[i-2].flags |= CHAR_IS_PARA_END;
}
else {
m_charinfo[i-1].flags |= CHAR_IS_PARA_END;
}
}
if ( last_direction < 0 ) {
hints |= HINT_DIRECTION_IS_RTL;
}
// The break char at i-1 is left out of what gets measured
int end = line_break ? i-1 : i;
int w = measureSegment(0, start, end, hints); // measure with font #0
// (NOT_MEASURED is also what we get for an empty segment)
if ( w != NOT_MEASURED )
final_width += w;
start = i;
if ( line_break )
prev_para_start = i;
}
last_bidi_level = new_bidi_level;
}
m_width = final_width;
m_is_measured = true;
}
// Based on crengine/src/lvfntman.cpp measureText() with _kerningMode == KERNING_MODE_HARFBUZZ
// Changes:
// - we work on the full m_text/m_charinfo, with absolute indices start and end (end excluded)
// - we don't use cumulative widths: we store individual char widths (so they fit in 16 bits
// in m_charinfo, instead of needing a full 32-bit int for each char)
int measureSegment(int font_num, int start, int end, int hints) {
if ( font_num > MAX_FONT_NUM )
return NOT_MEASURED;
#ifdef DEBUG_MEASURE_TEXT
char indent[32];
int n = 0;
for (; n<font_num; n++) {
indent[n*2] = ' ';
indent[n*2+1] = ' ';
}
indent[n*2] = 0;
#endif
int len = end - start;
if ( len <= 0 )
return NOT_MEASURED;
xtext_hb_font_data * hb_data = getHbFontData(font_num);
if ( !hb_data ) // No such font (so, no more fallback font)
return NOT_MEASURED;
hb_font_t * _hb_font = hb_data->hb_font;
hb_buffer_t * _hb_buffer = hb_data->hb_buffer;
hb_feature_t * _hb_features = hb_data->hb_features;
int _hb_features_nb = hb_data->hb_features_nb;
// Fill HarfBuzz buffer
hb_buffer_clear_contents(_hb_buffer);
// for (int i = start; i < end; i++) {
// hb_buffer_add(_hb_buffer, (hb_codepoint_t)(m_text[i]), i);
// }
hb_buffer_add_codepoints(_hb_buffer, (hb_codepoint_t*)m_text, m_length, start, end-start);
hb_buffer_set_content_type(_hb_buffer, HB_BUFFER_CONTENT_TYPE_UNICODE);
// If we are provided with direction and hints, let harfbuzz know
if ( hints & HINT_DIRECTION_IS_RTL )
hb_buffer_set_direction(_hb_buffer, HB_DIRECTION_RTL);
else
hb_buffer_set_direction(_hb_buffer, HB_DIRECTION_LTR);
int hb_flags = HB_BUFFER_FLAG_DEFAULT; // (hb_buffer_flags_t won't let us do |= )
if ( hints & HINT_BEGINS_PARAGRAPH )
hb_flags |= HB_BUFFER_FLAG_BOT;
if ( hints & HINT_ENDS_PARAGRAPH )
hb_flags |= HB_BUFFER_FLAG_EOT;
hb_buffer_set_flags(_hb_buffer, (hb_buffer_flags_t)hb_flags);
// If we got a specified language or a default one, let harfbuzz know
if ( m_lang )
hb_buffer_set_language(_hb_buffer, m_hb_language);
else if (default_lang)
hb_buffer_set_language(_hb_buffer, default_lang_hb_language);
// Let HB guess what's not been set (script, direction, language)
hb_buffer_guess_segment_properties(_hb_buffer);
// printf("HBlanguage: %s\n", hb_language_to_string(hb_buffer_get_language(_hb_buffer)));
// Shape
hb_shape(_hb_font, _hb_buffer, _hb_features, _hb_features_nb);
// Harfbuzz has guessed and set a direction even if we did not provide one.
bool is_rtl = false;
if ( hb_buffer_get_direction(_hb_buffer) == HB_DIRECTION_RTL ) {
is_rtl = true;
// "For buffers in the right-to-left (RTL) or bottom-to-top (BTT) text
// flow direction, the directionality of the buffer itself is reversed
// for final output as a matter of design. Therefore, HarfBuzz inverts
// the monotonic property: client programs are guaranteed that
// monotonically increasing initial cluster values will be returned as
// monotonically decreasing final cluster values."
// hb_buffer_reverse_clusters() puts the advance on the last char of a
// cluster, unlike hb_buffer_reverse() which puts it on the first, which
// looks more natural (like it happens when LTR).
// But hb_buffer_reverse_clusters() is required to have the clusters
// ordered as our text indices, so we can map them back to our text.
hb_buffer_reverse_clusters(_hb_buffer);
}
int glyph_count = hb_buffer_get_length(_hb_buffer);
hb_glyph_info_t * glyph_info = hb_buffer_get_glyph_infos(_hb_buffer, 0);
hb_glyph_position_t * glyph_pos = hb_buffer_get_glyph_positions(_hb_buffer, 0);
#ifdef DEBUG_MEASURE_TEXT
printf("%sMSHB >>> measureSegment start=%d len=%d is_rtl=%d [font#%d]\n",
indent, start, len, is_rtl, font_num);
for (int i = 0; i < (int)glyph_count; i++) {
char glyphname[32];
hb_font_get_glyph_name(_hb_font, glyph_info[i].codepoint, glyphname, sizeof(glyphname));
printf("%sMSHB g%d c%d(=t:%x) [%x %s]\tadvance=(%d,%d)", indent, i, glyph_info[i].cluster,
m_text[glyph_info[i].cluster], glyph_info[i].codepoint, glyphname,
FONT_METRIC_TO_PX(glyph_pos[i].x_advance), FONT_METRIC_TO_PX(glyph_pos[i].y_advance));
if (glyph_pos[i].x_offset || glyph_pos[i].y_offset)
printf("\toffset=(%d,%d)", FONT_METRIC_TO_PX(glyph_pos[i].x_offset), FONT_METRIC_TO_PX(glyph_pos[i].y_offset));
printf("\n");
}
printf("%sMSHB ---\n", indent);
#endif
// We need to set widths and flags on our original text.
// hb_shape() has modified buffer to contain glyphs, and text
// and buffer may desync (because of clusters, ligatures...)
// in both directions in a same run.
// Also, a cluster must not be cut, so we want to set the same
// width to all our original text chars that are part of the
// same cluster (so 2nd+ chars in a cluster will get a 0-width,
// and, when splitting lines, will fit on a line with the
// cluster leading char).
// So run along our original text (chars, t), and try to follow
// harfbuzz buffer (glyphs, hg), putting the advance of all
// the glyphs that belong to the same cluster (hcl) on the
// first char that started that cluster (and 0-width on the
// followup chars).
// It looks like Harfbuzz makes a cluster of combined glyphs
// even when the font does not have any or all of the required
// glyphs:
// When meeting a not-found glyph (codepoint=0, name=".notdef"),
// we record the original starting t of that cluster, and
// keep processing (possibly other chars with .notdef glyphs,
// giving them the width of the 'tofu' char), until we meet a char
// with a found glyph. We then hold on to this one, while we go
// measureSegment() the previous segment of text (that got .notdef
// glyphs) with a fallback font, and update the wrong widths
// and flags.
int final_width = 0;
int cur_cluster = 0;
int hg = 0; // index in glyph_info/glyph_pos
int hcl = 0; // cluster number of glyph at hg
int t_notdef_start = -1;
int t_notdef_end = -1;
int notdef_width = 0;
for ( int t = start; t < end; t++ ) {
#ifdef DEBUG_MEASURE_TEXT
printf("%sMSHB t%d (=%x) ", indent, t, m_text[t]);
#endif
// Grab all glyphs that do not belong to a cluster greater than our char position
int cur_width = 0; // current cluster width
while ( hg < glyph_count ) {
hcl = glyph_info[hg].cluster;
if ( hcl <= t ) { // glyph still part of a previous cluster
int advance = 0;
if ( glyph_info[hg].codepoint != 0 ) { // Codepoint found in this font
#ifdef DEBUG_MEASURE_TEXT
printf("(found cp=%x) ", glyph_info[hg].codepoint);
#endif
// Note: in crengine, we needed to add the following additional condition
// to only process past notdef when the first glyph of a cluster is found.
// This strangely seems not needed here (the thai sample that caused issues
// with crengine displays fine in xtext), but let's add it for consistency.
if ( t_notdef_start >= 0 && hcl > cur_cluster ) {
// We have a segment of previous ".notdef", and this glyph starts a new cluster
t_notdef_end = t;
// Let a fallback font replace the wrong values in widths and flags
// No-op if there is no more fallback font
#ifdef DEBUG_MEASURE_TEXT
printf("%s[...]\n%sMSHB ### measuring past failures with fallback font %d>%d\n",
indent, indent, t_notdef_start, t_notdef_end);
#endif
// Drop BOT/EOT flags if this segment is not at start/end
int fb_hints = hints;
if ( t_notdef_start > 0 )
fb_hints &= ~HINT_BEGINS_PARAGRAPH;
if ( t_notdef_end < len )
fb_hints &= ~HINT_ENDS_PARAGRAPH;
int fallback_width = measureSegment( font_num+1, t_notdef_start, t_notdef_end, fb_hints );
if ( fallback_width != NOT_MEASURED ) {
// The individual char widths will have been updated,
// but we need to correct final_width where we kept
// adding notdef widths
final_width = final_width - notdef_width + fallback_width;
}
#ifdef DEBUG_MEASURE_TEXT
printf("%sMSHB ### measured past failures > W= %d\n%s[...]",
indent, fallback_width, indent);
#endif
t_notdef_start = -1;
notdef_width = 0;
// And go on with the found glyph now that we fixed what was before
}
// Glyph found in this font
advance = FONT_METRIC_TO_PX(glyph_pos[hg].x_advance);
}
else {
#ifdef DEBUG_MEASURE_TEXT
printf("(glyph not found) ");
#endif
// Keep the advance of .notdef/tofu in case there is no fallback font to correct them
advance = FONT_METRIC_TO_PX(glyph_pos[hg].x_advance);
if ( t_notdef_start < 0 ) {
t_notdef_start = t;
}
}
#ifdef DEBUG_MEASURE_TEXT
printf("c%d+%d ", hcl, advance);
#endif
cur_width += advance;
cur_cluster = hcl;
hg++;
continue; // keep grabbing glyphs
}
break;
}
// Done grabbing clustered glyphs: they contributed to cur_width.
if ( t > cur_cluster ) {
// Our char is part of a cluster that started on a previous char
m_charinfo[t].width = 0;
m_charinfo[t].flags |= CHAR_IS_CLUSTER_TAIL;
// todo: look into using HB_GLYPH_FLAG_UNSAFE_TO_BREAK to
// set this flag instead/additionally
}
else {
// We're either a single char cluster, or the start
// of a multi chars cluster.
m_charinfo[t].width = cur_width; // get all the width
final_width += cur_width;
// It seems each soft-hyphen is in its own cluster, of length 1 and width 0,
// so HarfBuzz must already deal correctly with soft-hyphens.
if ( t_notdef_start >= 0 ) {
// If we had one glyph not found, we'll measure the whole cluster with