Line |
Branch |
Decision |
Exec |
Source |
1 |
|
|
|
/* |
2 |
|
|
|
** $Id: lutf8lib.c $ |
3 |
|
|
|
** Standard library for UTF-8 manipulation |
4 |
|
|
|
** See Copyright Notice in lua.h |
5 |
|
|
|
*/ |
6 |
|
|
|
|
7 |
|
|
|
#define lutf8lib_c |
8 |
|
|
|
#define LUA_LIB |
9 |
|
|
|
|
10 |
|
|
|
#include "lprefix.h" |
11 |
|
|
|
|
12 |
|
|
|
|
13 |
|
|
|
#include <assert.h> |
14 |
|
|
|
#include <limits.h> |
15 |
|
|
|
#include <stdlib.h> |
16 |
|
|
|
#include <string.h> |
17 |
|
|
|
|
18 |
|
|
|
#include "lua.h" |
19 |
|
|
|
|
20 |
|
|
|
#include "lauxlib.h" |
21 |
|
|
|
#include "lualib.h" |
22 |
|
|
|
|
23 |
|
|
|
|
24 |
|
|
|
/* largest code point accepted when decoding in strict mode */
#define MAXUNICODE 0x10FFFFu

/* largest value accepted at all (lax decoding and utf8.char); 31 bits */
#define MAXUTF 0x7FFFFFFFu


/* error message shared by the functions that reject malformed input */
#define MSGInvalid "invalid UTF-8 code"

/*
** Integer type for decoded UTF-8 values; MAXUTF needs 31 bits.
** 'unsigned int' is used when the preprocessor test below shows it has
** at least 31 value bits; otherwise 'unsigned long' is used.
*/
#if (UINT_MAX >> 30) >= 1
typedef unsigned int utfint;
#else
typedef unsigned long utfint;
#endif


/* true if byte value 'c' has the form 10xxxxxx (a continuation byte) */
#define iscont(c) (((c) & 0xC0) == 0x80)
/* same test applied to the byte that 'p' points to */
#define iscontp(p) iscont(*(p))
43 |
|
|
|
|
44 |
|
|
|
|
45 |
|
|
|
/* from strlib */ |
46 |
|
|
|
/* translate a relative string position: negative means back from end */ |
47 |
|
|
✗ |
static lua_Integer u_posrelat (lua_Integer pos, size_t len) { |
48 |
|
|
✗ |
if (pos >= 0) return pos; |
49 |
|
|
✗ |
else if (0u - (size_t)pos > len) return 0; |
50 |
|
|
✗ |
else return (lua_Integer)len + pos + 1; |
51 |
|
|
|
} |
52 |
|
|
|
|
53 |
|
|
|
|
54 |
|
|
|
/* |
55 |
|
|
|
** Decode one UTF-8 sequence, returning NULL if byte sequence is |
56 |
|
|
|
** invalid. The array 'limits' stores the minimum value for each |
57 |
|
|
|
** sequence length, to check for overlong representations. Its first |
58 |
|
|
|
** entry forces an error for non-ascii bytes with no continuation |
59 |
|
|
|
** bytes (count == 0). |
60 |
|
|
|
*/ |
61 |
|
|
✗ |
static const char *utf8_decode (const char *s, utfint *val, int strict) { |
62 |
|
|
|
static const utfint limits[] = |
63 |
|
|
|
{~(utfint)0, 0x80, 0x800, 0x10000u, 0x200000u, 0x4000000u}; |
64 |
|
|
✗ |
unsigned int c = (unsigned char)s[0]; |
65 |
|
|
✗ |
utfint res = 0; /* final result */ |
66 |
|
|
✗ |
if (c < 0x80) /* ascii? */ |
67 |
|
|
✗ |
res = c; |
68 |
|
|
|
else { |
69 |
|
|
✗ |
int count = 0; /* to count number of continuation bytes */ |
70 |
|
|
✗ |
for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */ |
71 |
|
|
✗ |
unsigned int cc = (unsigned char)s[++count]; /* read next byte */ |
72 |
|
|
✗ |
if (!iscont(cc)) /* not a continuation byte? */ |
73 |
|
|
✗ |
return NULL; /* invalid byte sequence */ |
74 |
|
|
✗ |
res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ |
75 |
|
|
|
} |
76 |
|
|
✗ |
res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */ |
77 |
|
|
✗ |
if (count > 5 || res > MAXUTF || res < limits[count]) |
78 |
|
|
✗ |
return NULL; /* invalid byte sequence */ |
79 |
|
|
✗ |
s += count; /* skip continuation bytes read */ |
80 |
|
|
|
} |
81 |
|
|
✗ |
if (strict) { |
82 |
|
|
|
/* check for invalid code points; too large or surrogates */ |
83 |
|
|
✗ |
if (res > MAXUNICODE || (0xD800u <= res && res <= 0xDFFFu)) |
84 |
|
|
✗ |
return NULL; |
85 |
|
|
|
} |
86 |
|
|
✗ |
if (val) *val = res; |
87 |
|
|
✗ |
return s + 1; /* +1 to include first byte */ |
88 |
|
|
|
} |
89 |
|
|
|
|
90 |
|
|
|
|
91 |
|
|
|
/* |
92 |
|
|
|
** utf8len(s [, i [, j [, lax]]]) --> number of characters that |
93 |
|
|
|
** start in the range [i,j], or nil + current position if 's' is not |
94 |
|
|
|
** well formed in that interval |
95 |
|
|
|
*/ |
96 |
|
|
✗ |
static int utflen (lua_State *L) { |
97 |
|
|
✗ |
lua_Integer n = 0; /* counter for the number of characters */ |
98 |
|
|
|
size_t len; /* string length in bytes */ |
99 |
|
|
✗ |
const char *s = luaL_checklstring(L, 1, &len); |
100 |
|
|
✗ |
lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); |
101 |
|
|
✗ |
lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len); |
102 |
|
|
✗ |
int lax = lua_toboolean(L, 4); |
103 |
|
|
✗ |
luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2, |
104 |
|
|
|
"initial position out of bounds"); |
105 |
|
|
✗ |
luaL_argcheck(L, --posj < (lua_Integer)len, 3, |
106 |
|
|
|
"final position out of bounds"); |
107 |
|
|
✗ |
while (posi <= posj) { |
108 |
|
|
✗ |
const char *s1 = utf8_decode(s + posi, NULL, !lax); |
109 |
|
|
✗ |
if (s1 == NULL) { /* conversion error? */ |
110 |
|
|
✗ |
luaL_pushfail(L); /* return fail ... */ |
111 |
|
|
✗ |
lua_pushinteger(L, posi + 1); /* ... and current position */ |
112 |
|
|
✗ |
return 2; |
113 |
|
|
|
} |
114 |
|
|
✗ |
posi = s1 - s; |
115 |
|
|
✗ |
n++; |
116 |
|
|
|
} |
117 |
|
|
✗ |
lua_pushinteger(L, n); |
118 |
|
|
✗ |
return 1; |
119 |
|
|
|
} |
120 |
|
|
|
|
121 |
|
|
|
|
122 |
|
|
|
/* |
123 |
|
|
|
** codepoint(s, [i, [j [, lax]]]) -> returns codepoints for all |
124 |
|
|
|
** characters that start in the range [i,j] |
125 |
|
|
|
*/ |
126 |
|
|
✗ |
static int codepoint (lua_State *L) { |
127 |
|
|
|
size_t len; |
128 |
|
|
✗ |
const char *s = luaL_checklstring(L, 1, &len); |
129 |
|
|
✗ |
lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); |
130 |
|
|
✗ |
lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len); |
131 |
|
|
✗ |
int lax = lua_toboolean(L, 4); |
132 |
|
|
|
int n; |
133 |
|
|
|
const char *se; |
134 |
|
|
✗ |
luaL_argcheck(L, posi >= 1, 2, "out of bounds"); |
135 |
|
|
✗ |
luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of bounds"); |
136 |
|
|
✗ |
if (posi > pose) return 0; /* empty interval; return no values */ |
137 |
|
|
✗ |
if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */ |
138 |
|
|
✗ |
return luaL_error(L, "string slice too long"); |
139 |
|
|
✗ |
n = (int)(pose - posi) + 1; /* upper bound for number of returns */ |
140 |
|
|
✗ |
luaL_checkstack(L, n, "string slice too long"); |
141 |
|
|
✗ |
n = 0; /* count the number of returns */ |
142 |
|
|
✗ |
se = s + pose; /* string end */ |
143 |
|
|
✗ |
for (s += posi - 1; s < se;) { |
144 |
|
|
|
utfint code; |
145 |
|
|
✗ |
s = utf8_decode(s, &code, !lax); |
146 |
|
|
✗ |
if (s == NULL) |
147 |
|
|
✗ |
return luaL_error(L, MSGInvalid); |
148 |
|
|
✗ |
lua_pushinteger(L, code); |
149 |
|
|
✗ |
n++; |
150 |
|
|
|
} |
151 |
|
|
✗ |
return n; |
152 |
|
|
|
} |
153 |
|
|
|
|
154 |
|
|
|
|
155 |
|
|
✗ |
/*
** Push onto the stack the UTF-8 encoding of the integer at stack
** index 'arg'. Raises an argument error if the value, seen as
** unsigned, is larger than MAXUTF (which also rejects negatives).
*/
static void pushutfchar (lua_State *L, int arg) {
  lua_Integer raw = luaL_checkinteger(L, arg);
  lua_Unsigned code = (lua_Unsigned)raw;
  luaL_argcheck(L, code <= MAXUTF, arg, "value out of range");
  lua_pushfstring(L, "%U", (long)code);
}
160 |
|
|
|
|
161 |
|
|
|
|
162 |
|
|
|
/* |
163 |
|
|
|
** utfchar(n1, n2, ...) -> char(n1)..char(n2)... |
164 |
|
|
|
*/ |
165 |
|
|
✗ |
static int utfchar (lua_State *L) { |
166 |
|
|
✗ |
int n = lua_gettop(L); /* number of arguments */ |
167 |
|
|
✗ |
if (n == 1) /* optimize common case of single char */ |
168 |
|
|
✗ |
pushutfchar(L, 1); |
169 |
|
|
|
else { |
170 |
|
|
|
int i; |
171 |
|
|
|
luaL_Buffer b; |
172 |
|
|
✗ |
luaL_buffinit(L, &b); |
173 |
|
|
✗ |
for (i = 1; i <= n; i++) { |
174 |
|
|
✗ |
pushutfchar(L, i); |
175 |
|
|
✗ |
luaL_addvalue(&b); |
176 |
|
|
|
} |
177 |
|
|
✗ |
luaL_pushresult(&b); |
178 |
|
|
|
} |
179 |
|
|
✗ |
return 1; |
180 |
|
|
|
} |
181 |
|
|
|
|
182 |
|
|
|
|
183 |
|
|
|
/* |
184 |
|
|
|
** offset(s, n, [i]) -> index where n-th character counting from |
185 |
|
|
|
** position 'i' starts; 0 means character at 'i'. |
186 |
|
|
|
*/ |
187 |
|
|
✗ |
static int byteoffset (lua_State *L) { |
188 |
|
|
|
size_t len; |
189 |
|
|
✗ |
const char *s = luaL_checklstring(L, 1, &len); |
190 |
|
|
✗ |
lua_Integer n = luaL_checkinteger(L, 2); |
191 |
|
|
✗ |
lua_Integer posi = (n >= 0) ? 1 : len + 1; |
192 |
|
|
✗ |
posi = u_posrelat(luaL_optinteger(L, 3, posi), len); |
193 |
|
|
✗ |
luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3, |
194 |
|
|
|
"position out of bounds"); |
195 |
|
|
✗ |
if (n == 0) { |
196 |
|
|
|
/* find beginning of current byte sequence */ |
197 |
|
|
✗ |
while (posi > 0 && iscontp(s + posi)) posi--; |
198 |
|
|
|
} |
199 |
|
|
|
else { |
200 |
|
|
✗ |
if (iscontp(s + posi)) |
201 |
|
|
✗ |
return luaL_error(L, "initial position is a continuation byte"); |
202 |
|
|
✗ |
if (n < 0) { |
203 |
|
|
✗ |
while (n < 0 && posi > 0) { /* move back */ |
204 |
|
|
|
do { /* find beginning of previous character */ |
205 |
|
|
✗ |
posi--; |
206 |
|
|
✗ |
} while (posi > 0 && iscontp(s + posi)); |
207 |
|
|
✗ |
n++; |
208 |
|
|
|
} |
209 |
|
|
|
} |
210 |
|
|
|
else { |
211 |
|
|
✗ |
n--; /* do not move for 1st character */ |
212 |
|
|
✗ |
while (n > 0 && posi < (lua_Integer)len) { |
213 |
|
|
|
do { /* find beginning of next character */ |
214 |
|
|
✗ |
posi++; |
215 |
|
|
✗ |
} while (iscontp(s + posi)); /* (cannot pass final '\0') */ |
216 |
|
|
✗ |
n--; |
217 |
|
|
|
} |
218 |
|
|
|
} |
219 |
|
|
|
} |
220 |
|
|
✗ |
if (n == 0) /* did it find given character? */ |
221 |
|
|
✗ |
lua_pushinteger(L, posi + 1); |
222 |
|
|
|
else /* no such character */ |
223 |
|
|
✗ |
luaL_pushfail(L); |
224 |
|
|
✗ |
return 1; |
225 |
|
|
|
} |
226 |
|
|
|
|
227 |
|
|
|
|
228 |
|
|
✗ |
/*
** Iteration step shared by the two iterator functions.
** Argument 1 is the string and argument 2 the control value (read
** with lua_tointeger and treated as a byte offset). Skips any
** continuation bytes at that offset, decodes the next character and
** returns its 1-based position and its code point, or returns nothing
** when the end of the string has been reached. Raises an error on an
** invalid sequence. 'strict' is passed on to the decoder.
*/
static int iter_aux (lua_State *L, int strict) {
  size_t len;
  const char *s = luaL_checklstring(L, 1, &len);
  lua_Unsigned n = (lua_Unsigned)lua_tointeger(L, 2);
  utfint code;
  const char *next;
  if (n < len) {
    for (; iscontp(s + n); n++)  /* go to next character */
      ;
  }
  if (n >= len)  /* (also handles original 'n' being negative) */
    return 0;  /* no more codepoints */
  next = utf8_decode(s + n, &code, strict);
  /* a continuation byte right after the sequence is also an error */
  if (next == NULL || iscontp(next))
    return luaL_error(L, MSGInvalid);
  lua_pushinteger(L, n + 1);
  lua_pushinteger(L, code);
  return 2;
}
247 |
|
|
|
|
248 |
|
|
|
|
249 |
|
|
✗ |
/* Iteration step with strict decoding; see 'iter_aux'. */
static int iter_auxstrict (lua_State *L) {
  const int strict = 1;
  return iter_aux(L, strict);
}
252 |
|
|
|
|
253 |
|
|
✗ |
/* Iteration step with lax (non-strict) decoding; see 'iter_aux'. */
static int iter_auxlax (lua_State *L) {
  const int strict = 0;
  return iter_aux(L, strict);
}
256 |
|
|
|
|
257 |
|
|
|
|
258 |
|
|
✗ |
/*
** codes(s [, lax]) -> iterator function, 's', 0
** Returns the three values for a generic 'for' that traverses the
** characters of 's'. Raises an argument error if 's' starts with a
** continuation byte. A true 'lax' selects the non-strict step
** function.
*/
static int iter_codes (lua_State *L) {
  int lax = lua_toboolean(L, 2);
  const char *s = luaL_checkstring(L, 1);
  luaL_argcheck(L, !iscontp(s), 1, MSGInvalid);
  if (lax)
    lua_pushcfunction(L, iter_auxlax);
  else
    lua_pushcfunction(L, iter_auxstrict);
  lua_pushvalue(L, 1);  /* state: the string itself */
  lua_pushinteger(L, 0);  /* initial control value */
  return 3;
}
267 |
|
|
|
|
268 |
|
|
|
|
269 |
|
|
|
/*
** pattern to match a single UTF-8 character: one byte in the ranges
** 0x00-0x7F or 0xC2-0xFD followed by zero or more bytes in the range
** 0x80-0xBF. The literal contains an embedded '\0', so its length
** must be taken with 'sizeof' rather than 'strlen'.
*/
#define UTF8PATT "[\0-\x7F\xC2-\xFD][\x80-\xBF]*"
271 |
|
|
|
|
272 |
|
|
|
|
273 |
|
|
|
/*
** Functions registered in the library table by 'luaopen_utf8'.
** The list ends with the {NULL, NULL} sentinel.
*/
static const luaL_Reg funcs[] = {
  {"offset", byteoffset},
  {"codepoint", codepoint},
  {"char", utfchar},
  {"len", utflen},
  {"codes", iter_codes},
  /* placeholders */
  {"charpattern", NULL},  /* real value is set in 'luaopen_utf8' */
  {NULL, NULL}
};
283 |
|
|
|
|
284 |
|
|
|
|
285 |
|
|
✗ |
/*
** Open the library: create a table with the functions in 'funcs',
** store the character pattern in its field "charpattern", and return
** the table.
*/
LUAMOD_API int luaopen_utf8 (lua_State *L) {
  /* array form keeps the embedded '\0' and gives the size via sizeof */
  static const char patt[] = UTF8PATT;
  luaL_newlib(L, funcs);
  lua_pushlstring(L, patt, sizeof(patt) - 1);  /* -1: drop final '\0' */
  lua_setfield(L, -2, "charpattern");
  return 1;
}
291 |
|
|
|
|
292 |
|
|
|
|