From b0810c51c3f075cc8a309bfb3c1714ac42b0f020 Mon Sep 17 00:00:00 2001 From: Roberto Ierusalimschy Date: Thu, 11 Apr 2019 11:29:16 -0300 Subject: [PATCH] Small optimizations in 'string.gsub' Avoid creating extra strings when possible: - avoid creating new resulting string when subject was not modified (instead, return the subject itself); - avoid creating strings representing the captured substrings when handling replacements like '%1' (instead, add the substring directly to the buffer). --- lstrlib.c | 130 ++++++++++++++++++++++++++++++++------------------ testes/gc.lua | 2 +- testes/pm.lua | 30 ++++++++++++ 3 files changed, 115 insertions(+), 47 deletions(-) diff --git a/lstrlib.c b/lstrlib.c index 6230cd0c..53ed80a3 100644 --- a/lstrlib.c +++ b/lstrlib.c @@ -660,25 +660,46 @@ static const char *lmemfind (const char *s1, size_t l1, } -static void push_onecapture (MatchState *ms, int i, const char *s, - const char *e) { +/* +** get information about the i-th capture. If there are no captures +** and 'i==0', return information about the whole match, which +** is the range 's'..'e'. If the capture is a string, return +** its length and put its address in '*cap'. If it is an integer +** (a position), push it on the stack and return CAP_POSITION. +*/ +static size_t get_onecapture (MatchState *ms, int i, const char *s, + const char *e, const char **cap) { if (i >= ms->level) { - if (i == 0) /* ms->level == 0, too */ - lua_pushlstring(ms->L, s, e - s); /* add whole match */ - else + if (i != 0) luaL_error(ms->L, "invalid capture index %%%d", i + 1); + *cap = s; + return e - s; } else { - ptrdiff_t l = ms->capture[i].len; - if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture"); - if (l == CAP_POSITION) + ptrdiff_t capl = ms->capture[i].len; + *cap = ms->capture[i].init; + if (capl == CAP_UNFINISHED) + luaL_error(ms->L, "unfinished capture"); + else if (capl == CAP_POSITION) lua_pushinteger(ms->L, (ms->capture[i].init - ms->src_init) + 1); - else - lua_pushlstring(ms->L, ms->capture[i].init, l); + return capl; } } +/* +** Push the i-th capture on the stack. +*/ +static void push_onecapture (MatchState *ms, int i, const char *s, + const char *e) { + const char *cap; + ptrdiff_t l = get_onecapture(ms, i, s, e, &cap); + if (l != CAP_POSITION) + lua_pushlstring(ms->L, cap, l); + /* else position was already pushed */ +} + + static int push_captures (MatchState *ms, const char *s, const char *e) { int i; int nlevels = (ms->level == 0 && s) ? 1 : ms->level; @@ -817,60 +838,72 @@ static int gmatch (lua_State *L) { static void add_s (MatchState *ms, luaL_Buffer *b, const char *s, const char *e) { - size_t l, i; + size_t l; lua_State *L = ms->L; const char *news = lua_tolstring(L, 3, &l); - for (i = 0; i < l; i++) { - if (news[i] != L_ESC) - luaL_addchar(b, news[i]); - else { - i++; /* skip ESC */ - if (!isdigit(uchar(news[i]))) { - if (news[i] != L_ESC) - luaL_error(L, "invalid use of '%c' in replacement string", L_ESC); - luaL_addchar(b, news[i]); - } - else if (news[i] == '0') - luaL_addlstring(b, s, e - s); - else { - push_onecapture(ms, news[i] - '1', s, e); - luaL_tolstring(L, -1, NULL); /* if number, convert it to string */ - lua_remove(L, -2); /* remove original value */ - luaL_addvalue(b); /* add capture to accumulated result */ - } + const char *p; + while ((p = (char *)memchr(news, L_ESC, l)) != NULL) { + luaL_addlstring(b, news, p - news); + p++; /* skip ESC */ + if (*p == L_ESC) /* '%%' */ + luaL_addchar(b, *p); + else if (*p == '0') /* '%0' */ + luaL_addlstring(b, s, e - s); + else if (isdigit(uchar(*p))) { /* '%n' */ + const char *cap; + ptrdiff_t resl = get_onecapture(ms, *p - '1', s, e, &cap); + if (resl == CAP_POSITION) + luaL_addvalue(b); /* add position to accumulated result */ + else + luaL_addlstring(b, cap, resl); } + else + luaL_error(L, "invalid use of '%c' in replacement string", L_ESC); + l -= p + 1 - news; + news = p + 1; } + luaL_addlstring(b, news, l); } -static void add_value (MatchState *ms, luaL_Buffer *b, const char *s, - const char *e, int tr) { +/* +** Add the replacement value to the string buffer 'b'. +** Return true if the original string was changed. (Function calls and +** table indexing resulting in nil or false do not change the subject.) +*/ +static int add_value (MatchState *ms, luaL_Buffer *b, const char *s, + const char *e, int tr) { lua_State *L = ms->L; switch (tr) { - case LUA_TFUNCTION: { + case LUA_TFUNCTION: { /* call the function */ int n; - lua_pushvalue(L, 3); - n = push_captures(ms, s, e); - lua_call(L, n, 1); + lua_pushvalue(L, 3); /* push the function */ + n = push_captures(ms, s, e); /* all captures as arguments */ + lua_call(L, n, 1); /* call it */ break; } - case LUA_TTABLE: { - push_onecapture(ms, 0, s, e); + case LUA_TTABLE: { /* index the table */ + push_onecapture(ms, 0, s, e); /* first capture is the index */ lua_gettable(L, 3); break; } default: { /* LUA_TNUMBER or LUA_TSTRING */ - add_s(ms, b, s, e); - return; + add_s(ms, b, s, e); /* add value to the buffer */ + return 1; /* something changed */ } } if (!lua_toboolean(L, -1)) { /* nil or false? */ - lua_pop(L, 1); - lua_pushlstring(L, s, e - s); /* keep original text */ + lua_pop(L, 1); /* remove value */ + luaL_addlstring(b, s, e - s); /* keep original text */ + return 0; /* no changes */ } else if (!lua_isstring(L, -1)) - luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1)); - luaL_addvalue(b); /* add result to accumulator */ + return luaL_error(L, "invalid replacement value (a %s)", + luaL_typename(L, -1)); + else { + luaL_addvalue(b); /* add result to accumulator */ + return 1; /* something changed */ + } } @@ -883,6 +916,7 @@ static int str_gsub (lua_State *L) { lua_Integer max_s = luaL_optinteger(L, 4, srcl + 1); /* max replacements */ int anchor = (*p == '^'); lua_Integer n = 0; /* replacement count */ + int changed = 0; /* change flag */ MatchState ms; luaL_Buffer b; luaL_argexpected(L, tr == LUA_TNUMBER || tr == LUA_TSTRING || @@ -898,7 +932,7 @@ static int str_gsub (lua_State *L) { reprepstate(&ms); /* (re)prepare state for new match */ if ((e = match(&ms, src, p)) != NULL && e != lastmatch) { /* match? */ n++; - add_value(&ms, &b, src, e, tr); /* add replacement to buffer */ + changed = add_value(&ms, &b, src, e, tr) | changed; src = lastmatch = e; } else if (src < ms.src_end) /* otherwise, skip one character */ @@ -906,8 +940,12 @@ static int str_gsub (lua_State *L) { else break; /* end of subject */ if (anchor) break; } - luaL_addlstring(&b, src, ms.src_end-src); - luaL_pushresult(&b); + if (!changed) /* no changes? */ + lua_pushvalue(L, 1); /* return original string */ + else { /* something changed */ + luaL_addlstring(&b, src, ms.src_end-src); + luaL_pushresult(&b); /* create and return new string */ + } lua_pushinteger(L, n); /* number of substitutions */ return 2; } diff --git a/testes/gc.lua b/testes/gc.lua index 91e78a48..6d24e0d8 100644 --- a/testes/gc.lua +++ b/testes/gc.lua @@ -113,7 +113,7 @@ do contCreate = 0 while contCreate <= limit do a = contCreate .. "b"; - a = string.gsub(a, '(%d%d*)', string.upper) + a = string.gsub(a, '(%d%d*)', "%1 %1") a = "a" contCreate = contCreate+1 end diff --git a/testes/pm.lua b/testes/pm.lua index 8cc8772e..4d87fad2 100644 --- a/testes/pm.lua +++ b/testes/pm.lua @@ -387,5 +387,35 @@ assert(string.match("abc\0\0\0", "%\0%\0?") == "\0\0") assert(string.find("abc\0\0","\0.") == 4) assert(string.find("abcx\0\0abc\0abc","x\0\0abc\0a.") == 4) + +do -- test reuse of original string in gsub + local s = string.rep("a", 100) + local r = string.gsub(s, "b", "c") -- no match + assert(string.format("%p", s) == string.format("%p", r)) + + r = string.gsub(s, ".", {x = "y"}) -- no substitutions + assert(string.format("%p", s) == string.format("%p", r)) + + local count = 0 + r = string.gsub(s, ".", function (x) + assert(x == "a") + count = count + 1 + return nil -- no substitution + end) + r = string.gsub(r, ".", {b = 'x'}) -- "a" is not a key; no subst. + assert(count == 100) + assert(string.format("%p", s) == string.format("%p", r)) + + count = 0 + r = string.gsub(s, ".", function (x) + assert(x == "a") + count = count + 1 + return x -- substitution... + end) + assert(count == 100) + -- no reuse in this case + assert(r == s and string.format("%p", s) ~= string.format("%p", r)) +end + print('OK')