From 6ccf1500396efffeb38ed54ac78d2f2d41a9b762 Mon Sep 17 00:00:00 2001
From: Roberto Ierusalimschy
Date: Thu, 6 Feb 2014 15:32:33 -0200
Subject: [PATCH] new library: utf8

---
 linit.c    |   3 +-
 lualib.h   |   5 +-
 lutf8lib.c | 281 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 makefile   |   9 ++-
 4 files changed, 292 insertions(+), 6 deletions(-)
 create mode 100644 lutf8lib.c

diff --git a/linit.c b/linit.c
index 8fd67fcd..1c0474e6 100644
--- a/linit.c
+++ b/linit.c
@@ -1,5 +1,5 @@
 /*
-** $Id: linit.c,v 1.31 2011/01/26 16:30:02 roberto Exp roberto $
+** $Id: linit.c,v 1.32 2011/04/08 19:17:36 roberto Exp roberto $
 ** Initialization of libraries for lua.c and other clients
 ** See Copyright Notice in lua.h
 */
@@ -34,6 +34,7 @@ static const luaL_Reg loadedlibs[] = {
   {LUA_IOLIBNAME, luaopen_io},
   {LUA_OSLIBNAME, luaopen_os},
   {LUA_STRLIBNAME, luaopen_string},
+  {LUA_UTF8LIBNAME, luaopen_utf8},
   {LUA_BITLIBNAME, luaopen_bit32},
   {LUA_MATHLIBNAME, luaopen_math},
   {LUA_DBLIBNAME, luaopen_debug},
diff --git a/lualib.h b/lualib.h
index 15adcdb4..ea073444 100644
--- a/lualib.h
+++ b/lualib.h
@@ -1,5 +1,5 @@
 /*
-** $Id: lualib.h,v 1.42 2011/05/25 14:12:28 roberto Exp roberto $
+** $Id: lualib.h,v 1.43 2011/12/08 12:11:37 roberto Exp roberto $
 ** Lua standard libraries
 ** See Copyright Notice in lua.h
 */
@@ -29,6 +29,9 @@ LUAMOD_API int (luaopen_os) (lua_State *L);
 #define LUA_STRLIBNAME	"string"
 LUAMOD_API int (luaopen_string) (lua_State *L);
 
+#define LUA_UTF8LIBNAME	"utf8"
+LUAMOD_API int (luaopen_utf8) (lua_State *L);
+
 #define LUA_BITLIBNAME	"bit32"
 LUAMOD_API int (luaopen_bit32) (lua_State *L);
 
diff --git a/lutf8lib.c b/lutf8lib.c
new file mode 100644
index 00000000..dfd52832
--- /dev/null
+++ b/lutf8lib.c
@@ -0,0 +1,281 @@
+/* gcc -shared -o utf8.so -fpic -O2 -Wall -I.. utf8.c */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+
+/* largest valid Unicode code point */
+#define MAXUNICODE	0x10FFFF
+
+/* true if the byte at 'p' is a UTF-8 continuation byte (10xxxxxx) */
+#define iscont(p)	((*(p) & 0xC0) == 0x80)
+
+
+/* from strlib */
+/*
+** Translate a relative string position: a negative 'pos' counts back
+** from the end of a string with 'len' bytes (-1 is the last byte).
+** Non-negative positions are returned unchanged; a negative position
+** that would fall before the start of the string yields 0.
+*/
+static lua_Integer posrelat (lua_Integer pos, size_t len) {
+  if (pos >= 0) return pos;
+  else if (0u - (size_t)pos > len) return 0;
+  else return (lua_Integer)len + pos + 1;
+}
+
+
+/*
+** Decode one UTF-8 sequence starting at 'o'.
+** Returns a pointer to the byte just after the sequence, or NULL if the
+** sequence is invalid: stray or missing continuation byte, more than
+** three continuation bytes, overlong encoding, or value above MAXUNICODE.
+** If 'val' is not NULL, '*val' receives the decoded code point.
+** The scan stops at the first byte that is not a continuation byte, so
+** it never runs past a terminating '\0'.
+*/
+static const char *utf8_decode (const char *o, int *val) {
+  /* limits[n]: largest value that must NOT be encoded with 'n'
+     continuation bytes (entry 0 rejects a lone continuation byte) */
+  static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
+  const unsigned char *s = (const unsigned char *)o;
+  unsigned int c = s[0];
+  unsigned int res = 0;  /* final result */
+  if (c < 0x80)  /* ascii? */
+    res = c;
+  else {
+    int count = 0;  /* to count number of continuation bytes */
+    while (c & 0x40) {  /* still have continuation bytes? */
+      int cc = s[++count];  /* read next byte */
+      if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
+        return NULL;  /* invalid byte sequence */
+      res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
+      c <<= 1;  /* to test next bit */
+    }
+    if (count > 3)  /* too long? (tested first to keep next shift valid) */
+      return NULL;  /* invalid byte sequence */
+    res |= ((c & 0x7F) << (count * 5));  /* add first byte */
+    if (res > MAXUNICODE || res <= limits[count])
+      return NULL;  /* invalid byte sequence */
+    s += count;  /* skip continuation bytes read */
+  }
+  if (val) *val = res;
+  return (const char *)s + 1;  /* +1 to include first byte */
+}
+
+
+/*
+** len(s, [i])   --> number of codepoints in 's' starting at byte
+** position 'i' (default 1; negative values count from the end of 's');
+** nil if that part of 's' is not well formed.
+** 'i' may be one past the end of 's', which gives 0.
+*/
+static int utflen (lua_State *L) {
+  int n = 0;
+  const char *ends;
+  size_t len;
+  const char *s = luaL_checklstring(L, 1, &len);
+  /* relative to the real length, so that negative positions work */
+  lua_Integer posi = posrelat(luaL_optinteger(L, 2, 1), len);
+  /* 'len + 1' is accepted so that the empty string has length 0 */
+  luaL_argcheck(L, 1 <= posi && posi <= (lua_Integer)len + 1, 2,
+                   "initial position out of string");
+  ends = s + len;
+  s += posi - 1;
+  while (s < ends && (s = utf8_decode(s, NULL)) != NULL)
+    n++;
+  if (s == ends)
+    lua_pushinteger(L, n);
+  else
+    lua_pushnil(L);
+  return 1;
+}
+
+
+/*
+** codepoint(s, [i, [j]])  -> returns codepoints for all characters
+** that start between byte positions 'i' and 'j' (both inclusive).
+** 'i' defaults to 1 and 'j' defaults to 'i'; negative positions count
+** from the end of 's'. Raises an error on an invalid byte sequence.
+*/
+static int codepoint (lua_State *L) {
+  size_t len;
+  const char *s = luaL_checklstring(L, 1, &len);
+  lua_Integer posi = posrelat(luaL_optinteger(L, 2, 1), len);
+  lua_Integer pose = posrelat(luaL_optinteger(L, 3, posi), len);
+  int n;
+  const char *se;
+  luaL_argcheck(L, posi >= 1, 2, "out of range");
+  luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range");
+  if (posi > pose) return 0;  /* empty interval; return no values */
+  n = (int)(pose - posi + 1);
+  if (posi + n <= pose)  /* (lua_Integer -> int) overflow? */
+    return luaL_error(L, "string slice too long");
+  luaL_checkstack(L, n, "string slice too long");
+  n = 0;  /* from now on, counts the values actually pushed */
+  se = s + pose;
+  for (s += posi - 1; s < se;) {
+    int code;
+    s = utf8_decode(s, &code);
+    if (s == NULL)
+      return luaL_error(L, "invalid UTF-8 code");
+    lua_pushinteger(L, code);
+    n++;
+  }
+  return n;
+}
+
+
+/*
+** Check that argument 'arg' is an integer in [0, MAXUNICODE] and push
+** the string that 'lua_pushfstring' builds for it with format "%U"
+** (expected to be its UTF-8 encoding).
+*/
+static void pushutfchar (lua_State *L, int arg) {
+  int code = luaL_checkint(L, arg);
+  luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
+  lua_pushfstring(L, "%U", code);
+}
+
+
+/*
+** char(n1, n2, ...)  -> char(n1)..char(n2)...
+** (the empty string when called with no arguments)
+*/
+static int utfchar (lua_State *L) {
+  int n = lua_gettop(L);  /* number of arguments */
+  if (n == 1)  /* optimize common case of single char */
+    pushutfchar(L, 1);
+  else {
+    int i;
+    luaL_Buffer b;
+    luaL_buffinit(L, &b);
+    for (i = 1; i <= n; i++) {
+      pushutfchar(L, i);
+      luaL_addvalue(&b);
+    }
+    luaL_pushresult(&b);
+  }
+  return 1;
+}
+
+
+/*
+** offset(s, n, [i])  -> byte index where the n-th character counting
+**   from position 'i' (default 1) starts.
+**   n > 0: the character at 'i' is number 1; n < 0: counts characters
+**   backwards from 'i'; n == 0: start of the character containing 'i'.
+**   Returns nil when the string runs out before 'n' reaches zero.
+*/
+static int byteoffset (lua_State *L) {
+  size_t len;
+  const char *s = luaL_checklstring(L, 1, &len);
+  int n = luaL_checkint(L, 2);
+  lua_Integer posi = posrelat(luaL_optinteger(L, 3, 1), len) - 1;
+  luaL_argcheck(L, 0 <= posi && posi <= (lua_Integer)len, 3,
+                   "position out of range");
+  if (n == 0) {
+    /* find beginning of current byte sequence */
+    while (posi > 0 && iscont(s + posi)) posi--;
+  }
+  else if (n < 0) {
+    while (n < 0 && posi > 0) {  /* move back */
+      do {  /* find beginning of previous character */
+        posi--;
+      } while (posi > 0 && iscont(s + posi));
+      n++;
+    }
+  }
+  else {
+    n--;  /* do not move for 1st character */
+    while (n > 0 && posi < (lua_Integer)len) {
+      do {  /* find beginning of next character */
+        posi++;
+      } while (iscont(s + posi));  /* ('\0' is not continuation) */
+      n--;
+    }
+  }
+  if (n == 0)
+    lua_pushinteger(L, posi + 1);
+  else
+    lua_pushnil(L);  /* no such position */
+  return 1;
+}
+
+
+/*
+** Iteration function for 'codes'. Receives the string and the byte
+** position of the previous character (0 in the first call); returns
+** the position and the code point of the next character, or nothing at
+** the end of the string. Raises an error on an invalid byte sequence.
+*/
+static int iter_aux (lua_State *L) {
+  size_t len;
+  const char *s = luaL_checklstring(L, 1, &len);
+  lua_Integer n = lua_tointeger(L, 2) - 1;  /* no narrowing to 'int' */
+  if (n < 0)  /* first iteration? */
+    n = 0;  /* start from here */
+  else if (n < (lua_Integer)len) {
+    n++;  /* skip current byte */
+    while (iscont(s + n)) n++;  /* and its continuations */
+  }
+  if (n >= (lua_Integer)len)
+    return 0;  /* no more codepoints */
+  else {
+    int code;
+    const char *next = utf8_decode(s + n, &code);
+    if (next == NULL || iscont(next))
+      return luaL_error(L, "invalid UTF-8 code");
+    lua_pushinteger(L, n + 1);
+    lua_pushinteger(L, code);
+    return 2;
+  }
+}
+
+
+/*
+** codes(s)  -> the triple (iter_aux, s, 0), to be used in a generic
+** 'for' that traverses the characters of 's'.
+*/
+static int iter_codes (lua_State *L) {
+  luaL_checkstring(L, 1);
+  lua_pushcfunction(L, iter_aux);
+  lua_pushvalue(L, 1);
+  lua_pushinteger(L, 0);
+  return 3;
+}
+
+
+/*
+** pattern to match a single UTF-8 character (it contains a '\0', so
+** it must always be pushed with an explicit length)
+*/
+#define UTF8PATT	"[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
+
+
+/* functions exported by the library */
+static const luaL_Reg funcs[] = {
+  {"offset", byteoffset},
+  {"codepoint", codepoint},
+  {"char", utfchar},
+  {"len", utflen},
+  {"codes", iter_codes},
+  {NULL, NULL}
+};
+
+
+/*
+** Open the library: leaves on the stack a new table with the functions
+** in 'funcs' plus the field 'charpatt' (the pattern UTF8PATT).
+*/
+LUAMOD_API int luaopen_utf8 (lua_State *L) {
+  luaL_newlib(L, funcs);
+  lua_pushliteral(L, UTF8PATT);
+  lua_setfield(L, -2, "charpatt");
+  return 1;
+}
+
diff --git a/makefile b/makefile
index 55aacf9a..7869fbac 100644
--- a/makefile
+++ b/makefile
@@ -75,7 +75,7 @@ CORE_O=	lapi.o lcode.o lctype.o ldebug.o ldo.o ldump.o lfunc.o lgc.o llex.o \
 	ltm.o lundump.o lvm.o lzio.o ltests.o
 AUX_O=	lauxlib.o
 LIB_O=	lbaselib.o ldblib.o liolib.o lmathlib.o loslib.o ltablib.o lstrlib.o \
-	lbitlib.o loadlib.o lcorolib.o linit.o
+	lutf8lib.o lbitlib.o loadlib.o lcorolib.o linit.o
 
 LUA_T=	lua
 LUA_O=	lua.o
@@ -153,7 +153,7 @@ lgc.o: lgc.c lua.h luaconf.h ldebug.h lstate.h lobject.h llimits.h ltm.h \
 linit.o: linit.c lua.h luaconf.h lualib.h lauxlib.h
 liolib.o: liolib.c lua.h luaconf.h lauxlib.h lualib.h
 llex.o: llex.c lua.h luaconf.h lctype.h llimits.h ldo.h lobject.h \
- lstate.h ltm.h lzio.h lmem.h llex.h lparser.h lstring.h lgc.h ltable.h
+ lstate.h ltm.h lzio.h lmem.h lgc.h llex.h lparser.h lstring.h ltable.h
 lmathlib.o: lmathlib.c lua.h luaconf.h lauxlib.h lualib.h
 lmem.o: lmem.c lua.h luaconf.h ldebug.h lstate.h lobject.h llimits.h \
  ltm.h lzio.h lmem.h ldo.h lgc.h
@@ -168,8 +168,8 @@ lparser.o: lparser.c lua.h luaconf.h lcode.h llex.h lobject.h llimits.h \
 lstate.o: lstate.c lua.h luaconf.h lapi.h llimits.h lstate.h lobject.h \
  ltm.h lzio.h lmem.h ldebug.h ldo.h lfunc.h lgc.h llex.h lstring.h \
  ltable.h
-lstring.o: lstring.c lua.h luaconf.h lmem.h llimits.h lobject.h lstate.h \
- ltm.h lzio.h lstring.h lgc.h
+lstring.o: lstring.c lua.h luaconf.h ldebug.h lstate.h lobject.h \
+ llimits.h ltm.h lzio.h lmem.h ldo.h lstring.h lgc.h
 lstrlib.o: lstrlib.c lua.h luaconf.h lauxlib.h lualib.h
 ltable.o: ltable.c lua.h luaconf.h ldebug.h lstate.h lobject.h llimits.h \
  ltm.h lzio.h lmem.h ldo.h lgc.h lstring.h ltable.h lvm.h
@@ -182,6 +182,7 @@ ltm.o: ltm.c lua.h luaconf.h ldebug.h lstate.h lobject.h llimits.h ltm.h \
 lua.o: lua.c lua.h luaconf.h lauxlib.h lualib.h
 lundump.o: lundump.c lua.h luaconf.h ldebug.h lstate.h lobject.h \
  llimits.h ltm.h lzio.h lmem.h ldo.h lfunc.h lstring.h lgc.h lundump.h
+lutf8lib.o: lutf8lib.c lua.h luaconf.h lauxlib.h lualib.h
 lvm.o: lvm.c lua.h luaconf.h ldebug.h lstate.h lobject.h llimits.h ltm.h \
  lzio.h lmem.h ldo.h lfunc.h lgc.h lopcodes.h lstring.h ltable.h lvm.h
 lzio.o: lzio.c lua.h luaconf.h llimits.h lmem.h lstate.h lobject.h ltm.h \