This repository has been archived by the owner on Jul 7, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutf8.hpp
163 lines (135 loc) · 9.98 KB
/
utf8.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
// Copyright 2022 The Jule Programming Language.
// Use of this source code is governed by a BSD 3-Clause
// license that can be found in the LICENSE file.
#ifndef __JULE_UTF8_HPP
#define __JULE_UTF8_HPP
//
// Implements functions and constants to support text encoded in
// UTF-8 for Jule strings. It includes functions to translate between
// runes and UTF-8 byte sequences.
// See https://en.wikipedia.org/wiki/UTF-8
//
// Based on std::unicode::utf8
//
#include <tuple>
#include "types.hpp"
#include "slice.hpp"
namespace jule {
// Rune produced when decoding invalid input and substituted for runes
// that cannot be encoded (U+FFFD).
constexpr signed int UTF8_RUNE_ERROR{ 0xFFFD };

// Payload masks: MASKX selects the data bits of a continuation byte,
// MASK2/MASK3/MASK4 select the data bits of the leading byte of a
// 2-, 3- and 4-byte sequence respectively.
constexpr signed int UTF8_MASKX{ 0x3F };
constexpr signed int UTF8_MASK2{ 0x1F };
constexpr signed int UTF8_MASK3{ 0x0F };
constexpr signed int UTF8_MASK4{ 0x07 };

// Lowest and highest value accepted for a continuation byte.
constexpr signed int UTF8_LOCB{ 0x80 };
constexpr signed int UTF8_HICB{ 0xBF };

// Entries of the utf8_first table.
// XX marks an invalid leading byte, AS marks a single-byte (ASCII) rune.
// For the S* values the low 3 bits are the sequence length and the
// high nibble is the index into utf8_accept_ranges.
constexpr signed int UTF8_XX{ 0xF1 };
constexpr signed int UTF8_AS{ 0xF0 };
constexpr signed int UTF8_S1{ 0x02 }; // accept range 0, length 2
constexpr signed int UTF8_S2{ 0x13 }; // accept range 1, length 3
constexpr signed int UTF8_S3{ 0x03 }; // accept range 0, length 3
constexpr signed int UTF8_S4{ 0x23 }; // accept range 2, length 3
constexpr signed int UTF8_S5{ 0x34 }; // accept range 3, length 4
constexpr signed int UTF8_S6{ 0x04 }; // accept range 0, length 4
constexpr signed int UTF8_S7{ 0x44 }; // accept range 4, length 4

// Upper bounds used by utf8_rune_to_bytes to select the encoded width.
constexpr signed int UTF8_RUNE1_MAX{ 0x7F };
constexpr signed int UTF8_RUNE2_MAX{ 0x7FF };
constexpr signed int UTF8_RUNE3_MAX{ 0xFFFF };

// Tag bits OR-ed into encoded bytes: TX for continuation bytes,
// T2/T3/T4 for the leading byte of a 2-, 3- and 4-byte sequence.
constexpr signed int UTF8_TX{ 0x80 };
constexpr signed int UTF8_T2{ 0xC0 };
constexpr signed int UTF8_T3{ 0xE0 };
constexpr signed int UTF8_T4{ 0xF0 };

// Largest encodable rune and the surrogate range that is not encodable.
constexpr signed int UTF8_MAX_RUNE{ 0x10FFFF };
constexpr signed int UTF8_SURROGATE_MIN{ 0xD800 };
constexpr signed int UTF8_SURROGATE_MAX{ 0xDFFF };
// Declarations
// Inclusive [lo, hi] range of values accepted for the second byte
// of a multi-byte UTF-8 sequence.
struct UTF8AcceptRange;
// Decodes the first UTF-8 encoded rune of the bytes s[0, len).
// Returns the rune and its width in bytes.
// Returns (UTF8_RUNE_ERROR, 0) if len < 1 and (UTF8_RUNE_ERROR, 1)
// if the bytes do not form a valid encoding.
std::tuple<jule::I32, jule::Int> utf8_decode_rune_str(const char *s, const jule::Int &len) noexcept;
// Returns the UTF-8 encoding of the rune r as a byte slice.
// Runes above UTF8_MAX_RUNE and surrogates are replaced by UTF8_RUNE_ERROR
// before encoding.
jule::Slice<jule::U8> utf8_rune_to_bytes(const jule::I32 &r) noexcept;
// Definitions
// Classification of every possible first byte of a UTF-8 sequence.
// The table is indexed by the byte value; each row below covers 16 values.
//
// UTF8_AS marks a byte that is a complete one-byte rune and UTF8_XX marks
// a byte that cannot start a sequence. For every other entry the low
// 3 bits hold the sequence length and the high nibble is the index into
// utf8_accept_ranges used to validate the second byte.
constexpr jule::U8 utf8_first[256] = {
// 0x00-0x0F
jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS,
// 0x10-0x1F
jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS,
// 0x20-0x2F
jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS,
// 0x30-0x3F
jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS,
// 0x40-0x4F
jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS,
// 0x50-0x5F
jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS,
// 0x60-0x6F
jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS,
// 0x70-0x7F
jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS, jule::UTF8_AS,
// 0x80-0x8F
jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX,
// 0x90-0x9F
jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX,
// 0xA0-0xAF
jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX,
// 0xB0-0xBF
jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX,
// 0xC0-0xCF (0xC0 and 0xC1 are rejected)
jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1,
// 0xD0-0xDF
jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1, jule::UTF8_S1,
// 0xE0-0xEF (0xE0 uses UTF8_S2 and 0xED uses UTF8_S4: restricted second byte)
jule::UTF8_S2, jule::UTF8_S3, jule::UTF8_S3, jule::UTF8_S3, jule::UTF8_S3, jule::UTF8_S3, jule::UTF8_S3, jule::UTF8_S3, jule::UTF8_S3, jule::UTF8_S3, jule::UTF8_S3, jule::UTF8_S3, jule::UTF8_S3, jule::UTF8_S4, jule::UTF8_S3, jule::UTF8_S3,
// 0xF0-0xFF (0xF0 uses UTF8_S5 and 0xF4 uses UTF8_S7; 0xF5 and above are rejected)
jule::UTF8_S5, jule::UTF8_S6, jule::UTF8_S6, jule::UTF8_S6, jule::UTF8_S7, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX, jule::UTF8_XX,
};
// Inclusive range of values accepted for the second byte of a
// multi-byte UTF-8 sequence.
struct UTF8AcceptRange {
    const jule::U8 lo; // Lowest accepted value for the second byte.
    const jule::U8 hi; // Highest accepted value for the second byte.
};
constexpr struct jule::UTF8AcceptRange utf8_accept_ranges[16] = {
{ jule::UTF8_LOCB, jule::UTF8_HICB },
{ 0xA0, jule::UTF8_HICB },
{ jule::UTF8_LOCB, 0x9F },
{ 0x90, jule::UTF8_HICB },
{ jule::UTF8_LOCB, 0x8F },
};
// Decodes the first UTF-8 encoded rune of the bytes s[0, len).
//
// Parameters:
//  s:   pointer to the bytes to decode; at least len bytes must be readable.
//  len: number of bytes available at s.
//
// Returns the decoded rune and its width in bytes:
//  - (UTF8_RUNE_ERROR, 0) if len < 1;
//  - (UTF8_RUNE_ERROR, 1) if the leading byte is invalid, the sequence is
//    truncated, or a following byte is out of its accepted range;
//  - (rune, width) with width in [1, 4] otherwise.
std::tuple<jule::I32, jule::Int>
utf8_decode_rune_str(const char *s, const jule::Int &len) noexcept {
    // Results are built by constructing the tuple directly. Explicit template
    // arguments on std::make_tuple would turn its parameters into rvalue
    // references, which cannot bind to the UTF8_RUNE_ERROR constant.
    using Result = std::tuple<jule::I32, jule::Int>;

    if (len < 1)
        return Result(jule::UTF8_RUNE_ERROR, 0);

    const jule::U8 s0{ static_cast<jule::U8>(s[0]) };
    const jule::U8 x{ jule::utf8_first[s0] };
    if (x >= jule::UTF8_AS) {
        // Only two table values reach this branch: UTF8_AS (one-byte rune)
        // and UTF8_XX (invalid leading byte). A plain comparison is used
        // rather than a shift-built bit mask, because shifting the promoted
        // int left by 31 overflows.
        if (x == jule::UTF8_AS)
            return Result(static_cast<jule::I32>(s0), 1);
        return Result(jule::UTF8_RUNE_ERROR, 1);
    }

    // Low 3 bits: sequence length. High nibble: accepted range of byte two.
    const jule::Int sz{ static_cast<jule::Int>(x & 7) };
    const jule::UTF8AcceptRange &accept = jule::utf8_accept_ranges[x >> 4];
    if (len < sz)
        return Result(jule::UTF8_RUNE_ERROR, 1);

    const jule::U8 s1{ static_cast<jule::U8>(s[1]) };
    if (s1 < accept.lo || accept.hi < s1)
        return Result(jule::UTF8_RUNE_ERROR, 1);
    if (sz <= 2)
        return Result(
            (static_cast<jule::I32>(s0 & jule::UTF8_MASK2) << 6) |
             static_cast<jule::I32>(s1 & jule::UTF8_MASKX),
            2);

    const jule::U8 s2{ static_cast<jule::U8>(s[2]) };
    if (s2 < jule::UTF8_LOCB || jule::UTF8_HICB < s2)
        return Result(jule::UTF8_RUNE_ERROR, 1);
    if (sz <= 3)
        return Result(
            (static_cast<jule::I32>(s0 & jule::UTF8_MASK3) << 12) |
            (static_cast<jule::I32>(s1 & jule::UTF8_MASKX) << 6) |
             static_cast<jule::I32>(s2 & jule::UTF8_MASKX),
            3);

    const jule::U8 s3{ static_cast<jule::U8>(s[3]) };
    if (s3 < jule::UTF8_LOCB || jule::UTF8_HICB < s3)
        return Result(jule::UTF8_RUNE_ERROR, 1);
    return Result(
        (static_cast<jule::I32>(s0 & jule::UTF8_MASK4) << 18) |
        (static_cast<jule::I32>(s1 & jule::UTF8_MASKX) << 12) |
        (static_cast<jule::I32>(s2 & jule::UTF8_MASKX) << 6) |
         static_cast<jule::I32>(s3 & jule::UTF8_MASKX),
        4);
}
// Returns the UTF-8 encoding of the rune r as a byte slice.
//
// Parameters:
//  r: the rune (code point) to encode.
//
// Returns a slice of 1 to 4 bytes. Runes above UTF8_MAX_RUNE (which
// includes negative values, seen as large unsigned numbers) and runes in
// the surrogate range are encoded as UTF8_RUNE_ERROR, i.e. the three
// bytes EF BF BD.
jule::Slice<jule::U8> utf8_rune_to_bytes(const jule::I32 &r) noexcept {
    const jule::U32 i{ static_cast<jule::U32>(r) };

    if (i <= static_cast<jule::U32>(jule::UTF8_RUNE1_MAX))
        return jule::Slice<jule::U8>({ static_cast<jule::U8>(i) });

    // Inclusive bound: UTF8_RUNE2_MAX itself still fits in two bytes.
    // Encoding it in three bytes would produce an overlong sequence.
    if (i <= static_cast<jule::U32>(jule::UTF8_RUNE2_MAX))
        return jule::Slice<jule::U8>({
            static_cast<jule::U8>(jule::UTF8_T2 | static_cast<jule::U8>(i >> 6)),
            static_cast<jule::U8>(jule::UTF8_TX | (static_cast<jule::U8>(i) & jule::UTF8_MASKX)) });

    // Substitute the error rune for values that have no valid encoding.
    jule::U32 c{ i };
    if (i > static_cast<jule::U32>(jule::UTF8_MAX_RUNE) ||
        (static_cast<jule::U32>(jule::UTF8_SURROGATE_MIN) <= i &&
         i <= static_cast<jule::U32>(jule::UTF8_SURROGATE_MAX)))
        c = static_cast<jule::U32>(jule::UTF8_RUNE_ERROR);

    // The width is chosen from the substituted value, so that an invalid
    // rune yields the three-byte encoding of UTF8_RUNE_ERROR and never an
    // overlong four-byte form.
    if (c <= static_cast<jule::U32>(jule::UTF8_RUNE3_MAX))
        return jule::Slice<jule::U8>({
            static_cast<jule::U8>(jule::UTF8_T3 | static_cast<jule::U8>(c >> 12)),
            static_cast<jule::U8>(jule::UTF8_TX | (static_cast<jule::U8>(c >> 6) & jule::UTF8_MASKX)),
            static_cast<jule::U8>(jule::UTF8_TX | (static_cast<jule::U8>(c) & jule::UTF8_MASKX)) });

    return jule::Slice<jule::U8>({
        static_cast<jule::U8>(jule::UTF8_T4 | static_cast<jule::U8>(c >> 18)),
        static_cast<jule::U8>(jule::UTF8_TX | (static_cast<jule::U8>(c >> 12) & jule::UTF8_MASKX)),
        static_cast<jule::U8>(jule::UTF8_TX | (static_cast<jule::U8>(c >> 6) & jule::UTF8_MASKX)),
        static_cast<jule::U8>(jule::UTF8_TX | (static_cast<jule::U8>(c) & jule::UTF8_MASKX)) });
}
} // namespace jule
#endif // #ifndef __JULE_UTF8_HPP