Merge #7892: Add full UTF-8 support to RPC

7982fce doc: Mention full UTF-8 support in release notes (Wladimir J. van der Laan)
6bbb4ef test: test utf-8 for labels in wallet (Wladimir J. van der Laan)
a406fcb test: add ensure_ascii setting to AuthServiceProxy (Wladimir J. van der Laan)
60ab9b2 Squashed 'src/univalue/' changes from 2740c4f..f32df99 (Wladimir J. van der Laan)
Wladimir J. van der Laan 2016-06-16 12:08:18 +02:00
commit 9c3d0fab36
GPG Key ID: 74810B012346C9A6
14 changed files with 209 additions and 44 deletions

View File

@ -43,6 +43,11 @@ RPC low-level changes
32-bit and 64-bit platforms, and the txids were missing in the hashed data. This has been
fixed, but this means that the output will differ from previous versions.
- Full UTF-8 support in the RPC API. Non-ASCII characters in, for example,
wallet labels were previously mangled because they weren't handled properly
in JSON RPC processing. This is no longer the case. This also affects
the GUI debug console.
C++11 and Python 3
-------------------

View File

@ -67,9 +67,11 @@ def EncodeDecimal(o):
class AuthServiceProxy(object):
__id_count = 0
def __init__(self, service_url, service_name=None, timeout=HTTP_TIMEOUT, connection=None):
# ensure_ascii: escape unicode as \uXXXX, passed to json.dumps
def __init__(self, service_url, service_name=None, timeout=HTTP_TIMEOUT, connection=None, ensure_ascii=True):
self.__service_url = service_url
self._service_name = service_name
self.ensure_ascii = ensure_ascii # can be toggled on the fly by tests
self.__url = urlparse.urlparse(service_url)
if self.__url.port is None:
port = 80
@ -134,12 +136,12 @@ class AuthServiceProxy(object):
AuthServiceProxy.__id_count += 1
log.debug("-%s-> %s %s"%(AuthServiceProxy.__id_count, self._service_name,
json.dumps(args, default=EncodeDecimal)))
json.dumps(args, default=EncodeDecimal, ensure_ascii=self.ensure_ascii)))
postdata = json.dumps({'version': '1.1',
'method': self._service_name,
'params': args,
'id': AuthServiceProxy.__id_count}, default=EncodeDecimal)
response = self._request('POST', self.__url.path, postdata)
'id': AuthServiceProxy.__id_count}, default=EncodeDecimal, ensure_ascii=self.ensure_ascii)
response = self._request('POST', self.__url.path, postdata.encode('utf-8'))
if response['error'] is not None:
raise JSONRPCException(response['error'])
elif 'result' not in response:
@ -149,9 +151,9 @@ class AuthServiceProxy(object):
return response['result']
def _batch(self, rpc_call_list):
postdata = json.dumps(list(rpc_call_list), default=EncodeDecimal)
postdata = json.dumps(list(rpc_call_list), default=EncodeDecimal, ensure_ascii=self.ensure_ascii)
log.debug("--> "+postdata)
return self._request('POST', self.__url.path, postdata)
return self._request('POST', self.__url.path, postdata.encode('utf-8'))
def _get_response(self):
http_response = self.__conn.getresponse()
@ -167,7 +169,7 @@ class AuthServiceProxy(object):
responsedata = http_response.read().decode('utf8')
response = json.loads(responsedata, parse_float=decimal.Decimal)
if "error" in response and response["error"] is None:
log.debug("<-%s- %s"%(response["id"], json.dumps(response["result"], default=EncodeDecimal)))
log.debug("<-%s- %s"%(response["id"], json.dumps(response["result"], default=EncodeDecimal, ensure_ascii=self.ensure_ascii)))
else:
log.debug("<-- "+responsedata)
return response
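
The ensure_ascii flag is forwarded straight to json.dumps, so the request body either \u-escapes non-ASCII characters or carries them as raw UTF-8; that is also why the diff now encodes the post data to UTF-8 bytes before sending it. A minimal sketch of the two wire formats (standard json module behaviour, not code from this PR):

# -*- coding: utf-8 -*-
import json

label = u'рыба'
# ensure_ascii=True (the default): non-ASCII escaped as \uXXXX
print(json.dumps({'params': [label]}, ensure_ascii=True))
# {"params": ["\u0440\u044b\u0431\u0430"]}
# ensure_ascii=False: characters emitted as raw UTF-8, so the body must be
# encoded to bytes before it is POSTed
print(json.dumps({'params': [label]}, ensure_ascii=False))
# {"params": ["рыба"]}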

View File

@ -309,6 +309,20 @@ class WalletTest (BitcoinTestFramework):
balance_nodes = [self.nodes[i].getbalance() for i in range(3)]
block_count = self.nodes[0].getblockcount()
# Check modes:
# - True: unicode escaped as \u....
# - False: unicode directly as UTF-8
for mode in [True, False]:
self.nodes[0].ensure_ascii = mode
# unicode check: Basic Multilingual Plane, Supplementary Plane respectively
for s in [u'рыба', u'𝅘𝅥𝅯']:
addr = self.nodes[0].getaccountaddress(s)
label = self.nodes[0].getaccount(addr)
assert_equal(label, s)
assert(s in self.nodes[0].listaccounts().keys())
self.nodes[0].ensure_ascii = True # restore to default
# maintenance tests
maintenance = [
'-rescan',
'-reindex',

View File

@ -3,7 +3,7 @@ ACLOCAL_AMFLAGS = -I build-aux/m4
.INTERMEDIATE: $(GENBIN)
include_HEADERS = include/univalue.h
noinst_HEADERS = lib/univalue_escapes.h
noinst_HEADERS = lib/univalue_escapes.h lib/univalue_utffilter.h
lib_LTLIBRARIES = libunivalue.la
@ -73,6 +73,10 @@ TEST_FILES = \
$(TEST_DATA_DIR)/fail35.json \
$(TEST_DATA_DIR)/fail36.json \
$(TEST_DATA_DIR)/fail37.json \
$(TEST_DATA_DIR)/fail38.json \
$(TEST_DATA_DIR)/fail39.json \
$(TEST_DATA_DIR)/fail40.json \
$(TEST_DATA_DIR)/fail41.json \
$(TEST_DATA_DIR)/fail3.json \
$(TEST_DATA_DIR)/fail4.json \
$(TEST_DATA_DIR)/fail5.json \
@ -83,6 +87,7 @@ TEST_FILES = \
$(TEST_DATA_DIR)/pass1.json \
$(TEST_DATA_DIR)/pass2.json \
$(TEST_DATA_DIR)/pass3.json \
$(TEST_DATA_DIR)/round1.json
$(TEST_DATA_DIR)/round1.json \
$(TEST_DATA_DIR)/round2.json
EXTRA_DIST=$(TEST_FILES) $(GEN_SRCS)

View File

@ -1,7 +1,7 @@
m4_define([libunivalue_major_version], [1])
m4_define([libunivalue_minor_version], [1])
m4_define([libunivalue_micro_version], [1])
m4_define([libunivalue_interface_age], [1])
m4_define([libunivalue_micro_version], [2])
m4_define([libunivalue_interface_age], [2])
# If you need a modifier for the version number.
# Normally empty, but can be used to make "fixup" releases.
m4_define([libunivalue_extraversion], [])
@ -14,7 +14,7 @@ m4_define([libunivalue_age], [m4_eval(libunivalue_binary_age - libunivalue_inter
m4_define([libunivalue_version], [libunivalue_major_version().libunivalue_minor_version().libunivalue_micro_version()libunivalue_extraversion()])
AC_INIT([univalue], [1.0.1],
AC_INIT([univalue], [1.0.2],
[http://github.com/jgarzik/univalue/])
dnl make the compilation flags quiet unless V=1 is used

View File

@ -6,6 +6,7 @@
#include <vector>
#include <stdio.h>
#include "univalue.h"
#include "univalue_utffilter.h"
using namespace std;
@ -174,41 +175,31 @@ enum jtokentype getJsonToken(string& tokenVal, unsigned int& consumed,
raw++; // skip "
string valStr;
JSONUTF8StringFilter writer(valStr);
while (*raw) {
if (*raw < 0x20)
if ((unsigned char)*raw < 0x20)
return JTOK_ERR;
else if (*raw == '\\') {
raw++; // skip backslash
switch (*raw) {
case '"': valStr += "\""; break;
case '\\': valStr += "\\"; break;
case '/': valStr += "/"; break;
case 'b': valStr += "\b"; break;
case 'f': valStr += "\f"; break;
case 'n': valStr += "\n"; break;
case 'r': valStr += "\r"; break;
case 't': valStr += "\t"; break;
case '"': writer.push_back('\"'); break;
case '\\': writer.push_back('\\'); break;
case '/': writer.push_back('/'); break;
case 'b': writer.push_back('\b'); break;
case 'f': writer.push_back('\f'); break;
case 'n': writer.push_back('\n'); break;
case 'r': writer.push_back('\r'); break;
case 't': writer.push_back('\t'); break;
case 'u': {
unsigned int codepoint;
if (hatoui(raw + 1, raw + 1 + 4, codepoint) !=
raw + 1 + 4)
return JTOK_ERR;
if (codepoint <= 0x7f)
valStr.push_back((char)codepoint);
else if (codepoint <= 0x7FF) {
valStr.push_back((char)(0xC0 | (codepoint >> 6)));
valStr.push_back((char)(0x80 | (codepoint & 0x3F)));
} else if (codepoint <= 0xFFFF) {
valStr.push_back((char)(0xE0 | (codepoint >> 12)));
valStr.push_back((char)(0x80 | ((codepoint >> 6) & 0x3F)));
valStr.push_back((char)(0x80 | (codepoint & 0x3F)));
}
writer.push_back_u(codepoint);
raw += 4;
break;
}
@ -226,11 +217,13 @@ enum jtokentype getJsonToken(string& tokenVal, unsigned int& consumed,
}
else {
valStr += *raw;
writer.push_back(*raw);
raw++;
}
}
if (!writer.finalize())
return JTOK_ERR;
tokenVal = valStr;
consumed = (raw - rawStart);
return JTOK_STRING;

View File

@ -0,0 +1,119 @@
// Copyright 2016 Wladimir J. van der Laan
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef UNIVALUE_UTFFILTER_H
#define UNIVALUE_UTFFILTER_H
#include <string>
/**
* Filter that generates and validates UTF-8, as well as collates UTF-16
* surrogate pairs as specified in RFC4627.
*/
class JSONUTF8StringFilter
{
public:
JSONUTF8StringFilter(std::string &s):
str(s), is_valid(true), codepoint(0), state(0), surpair(0)
{
}
// Write single 8-bit char (may be part of UTF-8 sequence)
void push_back(unsigned char ch)
{
if (state == 0) {
if (ch < 0x80) // 7-bit ASCII, fast direct pass-through
str.push_back(ch);
else if (ch < 0xc0) // Mid-sequence character, invalid in this state
is_valid = false;
else if (ch < 0xe0) { // Start of 2-byte sequence
codepoint = (ch & 0x1f) << 6;
state = 6;
} else if (ch < 0xf0) { // Start of 3-byte sequence
codepoint = (ch & 0x0f) << 12;
state = 12;
} else if (ch < 0xf8) { // Start of 4-byte sequence
codepoint = (ch & 0x07) << 18;
state = 18;
} else // Reserved, invalid
is_valid = false;
} else {
if ((ch & 0xc0) != 0x80) // Not a continuation, invalid
is_valid = false;
state -= 6;
codepoint |= (ch & 0x3f) << state;
if (state == 0)
push_back_u(codepoint);
}
}
// Write codepoint directly, possibly collating surrogate pairs
void push_back_u(unsigned int codepoint)
{
if (state) // Only accept full codepoints in open state
is_valid = false;
if (codepoint >= 0xD800 && codepoint < 0xDC00) { // First half of surrogate pair
if (surpair) // Two subsequent surrogate pair openers - fail
is_valid = false;
else
surpair = codepoint;
} else if (codepoint >= 0xDC00 && codepoint < 0xE000) { // Second half of surrogate pair
if (surpair) { // Open surrogate pair, expect second half
// Compute code point from UTF-16 surrogate pair
append_codepoint(0x10000 | ((surpair - 0xD800)<<10) | (codepoint - 0xDC00));
surpair = 0;
} else // Second half doesn't follow a first half - fail
is_valid = false;
} else {
if (surpair) // First half of surrogate pair not followed by second - fail
is_valid = false;
else
append_codepoint(codepoint);
}
}
// Check that we're in a state where the string can be ended
// No open sequences, no open surrogate pairs, etc
bool finalize()
{
if (state || surpair)
is_valid = false;
return is_valid;
}
private:
std::string &str;
bool is_valid;
// Current UTF-8 decoding state
unsigned int codepoint;
int state; // Top bit to be filled in for next UTF-8 byte, or 0
// Keep track of the following state to handle the following section of
// RFC4627:
//
// To escape an extended character that is not in the Basic Multilingual
// Plane, the character is represented as a twelve-character sequence,
// encoding the UTF-16 surrogate pair. So, for example, a string
// containing only the G clef character (U+1D11E) may be represented as
// "\uD834\uDD1E".
//
// Two subsequent \u.... may have to be replaced with one actual codepoint.
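// For the example above: 0x10000 | ((0xD834 - 0xD800) << 10) | (0xDD1E - 0xDC00) = 0x1D11E.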
unsigned int surpair; // First half of open UTF-16 surrogate pair, or 0
void append_codepoint(unsigned int codepoint)
{
if (codepoint <= 0x7f)
str.push_back((char)codepoint);
else if (codepoint <= 0x7FF) {
str.push_back((char)(0xC0 | (codepoint >> 6)));
str.push_back((char)(0x80 | (codepoint & 0x3F)));
} else if (codepoint <= 0xFFFF) {
str.push_back((char)(0xE0 | (codepoint >> 12)));
str.push_back((char)(0x80 | ((codepoint >> 6) & 0x3F)));
str.push_back((char)(0x80 | (codepoint & 0x3F)));
} else if (codepoint <= 0x1FFFFF) {
str.push_back((char)(0xF0 | (codepoint >> 18)));
str.push_back((char)(0x80 | ((codepoint >> 12) & 0x3F)));
str.push_back((char)(0x80 | ((codepoint >> 6) & 0x3F)));
str.push_back((char)(0x80 | (codepoint & 0x3F)));
}
}
};
#endif
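
To sanity-check the arithmetic above, here is a small Python sketch (an editor's illustration, not part of the PR) that mirrors the surrogate-pair collation in push_back_u() and the byte layout in append_codepoint() for the pair \ud834\udd61 (U+1D161) used in the new unit test:

# Collate the UTF-16 surrogate pair \ud834\udd61, as push_back_u() does
hi, lo = 0xD834, 0xDD61
cp = 0x10000 | ((hi - 0xD800) << 10) | (lo - 0xDC00)
assert cp == 0x1D161

# Encode the codepoint as UTF-8, mirroring append_codepoint()'s branches
if cp <= 0x7F:
    out = [cp]
elif cp <= 0x7FF:
    out = [0xC0 | (cp >> 6), 0x80 | (cp & 0x3F)]
elif cp <= 0xFFFF:
    out = [0xE0 | (cp >> 12), 0x80 | ((cp >> 6) & 0x3F), 0x80 | (cp & 0x3F)]
else:
    out = [0xF0 | (cp >> 18), 0x80 | ((cp >> 12) & 0x3F),
           0x80 | ((cp >> 6) & 0x3F), 0x80 | (cp & 0x3F)]

assert bytes(bytearray(out)) == b'\xf0\x9d\x85\xa1'  # same bytes the unit test expects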

View File

@ -8,8 +8,6 @@
#include "univalue.h"
#include "univalue_escapes.h"
// TODO: Using UTF8
using namespace std;
static string json_escape(const string& inS)
@ -23,15 +21,8 @@ static string json_escape(const string& inS)
if (escStr)
outS += escStr;
else if (ch < 0x80)
else
outS += ch;
else { // TODO handle UTF-8 properly
char tmpesc[16];
sprintf(tmpesc, "\\u%04x", ch);
outS += tmpesc;
}
}
return outS;
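
The removed branch is what mangled non-ASCII output: json_escape() walks the string byte by byte, so every byte >= 0x80 of a multi-byte UTF-8 sequence got its own \u00XX escape. A rough Python illustration of the old behaviour (an editor's sketch based on the removed lines, not code from the PR):

# -*- coding: utf-8 -*-
raw = u'рыба'.encode('utf-8')        # bytes d1 80 d1 8b d0 b1 d0 b0
old_style = ''.join('\\u%04x' % b for b in bytearray(raw))
print(old_style)                     # \u00d1\u0080\u00d1\u008b\u00d0\u00b1\u00d0\u00b0
# After this change, bytes >= 0x80 pass through unescaped; validating that they
# form proper UTF-8 is now the reader's job (JSONUTF8StringFilter).
print(raw.decode('utf-8'))           # рыба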

View File

@ -0,0 +1 @@
["\ud834"]

View File

@ -0,0 +1 @@
["\udd61"]

View File

@ -0,0 +1 @@
["揣。"]

View File

@ -0,0 +1 @@
["<22><><EFBFBD>"]

View File

@ -0,0 +1 @@
["a§■𐎒𝅘𝅥𝅯"]

View File

@ -22,6 +22,7 @@ string srcdir(JSON_TEST_SRC);
static bool test_failed = false;
#define d_assert(expr) { if (!(expr)) { test_failed = true; fprintf(stderr, "%s failed\n", filename.c_str()); } }
#define f_assert(expr) { if (!(expr)) { test_failed = true; fprintf(stderr, "%s failed\n", __func__); } }
static std::string rtrim(std::string s)
{
@ -108,6 +109,10 @@ static const char *filenames[] = {
"fail35.json",
"fail36.json",
"fail37.json",
"fail38.json", // invalid unicode: only first half of surrogate pair
"fail39.json", // invalid unicode: only second half of surrogate pair
"fail40.json", // invalid unicode: broken UTF-8
"fail41.json", // invalid unicode: unfinished UTF-8
"fail3.json",
"fail4.json", // extra comma
"fail5.json",
@ -119,14 +124,40 @@ static const char *filenames[] = {
"pass2.json",
"pass3.json",
"round1.json", // round-trip test
"round2.json", // unicode
};
// Test \u handling
void unescape_unicode_test()
{
UniValue val;
bool testResult;
// Escaped ASCII (quote)
testResult = val.read("[\"\\u0022\"]");
f_assert(testResult);
f_assert(val[0].get_str() == "\"");
// Escaped Basic Plane character, two-byte UTF-8
testResult = val.read("[\"\\u0191\"]");
f_assert(testResult);
f_assert(val[0].get_str() == "\xc6\x91");
// Escaped Basic Plane character, three-byte UTF-8
testResult = val.read("[\"\\u2191\"]");
f_assert(testResult);
f_assert(val[0].get_str() == "\xe2\x86\x91");
// Escaped Supplementary Plane character U+1d161
testResult = val.read("[\"\\ud834\\udd61\"]");
f_assert(testResult);
f_assert(val[0].get_str() == "\xf0\x9d\x85\xa1");
}
int main (int argc, char *argv[])
{
for (unsigned int fidx = 0; fidx < ARRAY_SIZE(filenames); fidx++) {
runtest_file(filenames[fidx]);
}
unescape_unicode_test();
return test_failed ? 1 : 0;
}