bpo-32030: Add _Py_EncodeUTF8_surrogateescape() by vstinner · Pull Request #4960 · python/cpython
/* UTF-8 encoder using the surrogateescape error handler.

   Encode the null-terminated wide character string 'text' to UTF-8.
   Surrogate characters in the range U+DC80..U+DCFF are written as the
   single bytes 0x80..0xFF; any other lone surrogate is an encoding error.
   When wchar_t is 16 bits wide, a high surrogate immediately followed by a
   low surrogate is combined into a single non-BMP code point first.

   On success, return a pointer to a newly allocated character string (use
   PyMem_Free() to free the memory).

   On encoding failure, return NULL and write the position of the invalid
   surrogate character into *error_pos (if error_pos is set).

   On memory allocation failure, return NULL and write (size_t)-1 into
   *error_pos (if error_pos is set). */
char*
_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos)
{
    /* A single code point needs at most 4 bytes in UTF-8. */
    const Py_ssize_t max_char_size = 4;
    Py_ssize_t len = wcslen(text);

    assert(len >= 0);

    /* Allocate the worst case, (len + 1) * max_char_size bytes (the +1 is
       for the trailing NUL), refusing sizes that would overflow. */
    char *bytes;
    if (len <= PY_SSIZE_T_MAX / max_char_size - 1) {
        bytes = PyMem_Malloc((len + 1) * max_char_size);
    }
    else {
        bytes = NULL;
    }
    if (bytes == NULL) {
        if (error_pos != NULL) {
            *error_pos = (size_t)-1;
        }
        return NULL;
    }

    char *p = bytes;
    Py_ssize_t i;
    for (i = 0; i < len; ) {
        /* Index of the first wchar_t of the current character; reported
           through *error_pos if the character cannot be encoded. */
        Py_ssize_t ch_pos = i;
        Py_UCS4 ch = text[i++];

#if WCHAR_MAX <= 0xFFFF
        /* wchar_t holds UTF-16 code units: a high surrogate followed by a
           low surrogate is one code point above U+FFFF, not two lone
           surrogates. */
        if (0xD800 <= ch && ch <= 0xDBFF && i < len) {
            Py_UCS4 low = text[i];
            if (0xDC00 <= low && low <= 0xDFFF) {
                ch = 0x10000 + (((ch & 0x03FF) << 10) | (low & 0x03FF));
                i++;
            }
        }
#endif

        if (ch < 0x80) {
            /* Encode ASCII */
            *p++ = (char) ch;
        }
        else if (ch < 0x0800) {
            /* Two-byte sequence: U+0080..U+07FF */
            *p++ = (char)(0xc0 | (ch >> 6));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
        else if (Py_UNICODE_IS_SURROGATE(ch)) {
            /* surrogateescape error handler: only U+DC80..U+DCFF can be
               turned back into a raw byte. */
            if (!(0xDC80 <= ch && ch <= 0xDCFF)) {
                if (error_pos != NULL) {
                    *error_pos = (size_t)ch_pos;
                }
                goto error;
            }
            *p++ = (char)(ch & 0xff);
        }
        else if (ch < 0x10000) {
            /* Three-byte sequence: rest of the BMP */
            *p++ = (char)(0xe0 | (ch >> 12));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
        else {  /* ch >= 0x10000 */
            assert(ch <= MAX_UNICODE);
            /* Encode UCS4 Unicode ordinals */
            *p++ = (char)(0xf0 | (ch >> 18));
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
    }
    *p++ = '\0';

    /* Shrink the worst-case buffer to the size actually used. On failure
       the original buffer is still valid and is released at 'error'. */
    size_t final_size = (p - bytes);
    char *bytes2 = PyMem_Realloc(bytes, final_size);
    if (bytes2 == NULL) {
        if (error_pos != NULL) {
            *error_pos = (size_t)-1;
        }
        goto error;
    }
    return bytes2;

error:
    PyMem_Free(bytes);
    return NULL;
}
/* Primary internal function which creates utf8 encoded bytes objects.
Allocation strategy: if the string is short, convert into a stack buffer