Update test_unicodedata from CPython 3.11.2 by dalinaum · Pull Request #4678 · RustPython/RustPython

@@ -1,32 +1,32 @@ """ Test script for the unicodedata module. """ Tests for the unicodedata module.
Written by Marc-Andre Lemburg (mal@lemburg.com).
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
"""
import hashlib from http.client import HTTPException import sys import unicodedata import unittest import hashlib from test.support import script_helper
encoding = 'utf-8' errors = 'surrogatepass' from test.support import (open_urlresource, requires_resource, script_helper, cpython_only, check_disallow_instantiation, ResourceDenied)

### Run tests
class UnicodeMethodsTest(unittest.TestCase):
# update this, if the database changes expectedchecksum = '9129d6f2bdf008a81c2476e5b5127014a62130c1' expectedchecksum = '4739770dd4d0e5f1b1677accfc3552ed3c8ef326'
# TODO: RUSTPYTHON @unittest.expectedFailure @requires_resource('cpu') def test_method_checksum(self): h = hashlib.sha1() for i in range(0x10000): for i in range(sys.maxunicode + 1): char = chr(i) data = [ # Predicates (single char) @@ -63,33 +63,26 @@ def test_method_checksum(self): (char + 'ABC').title(),
] h.update(''.join(data).encode(encoding, errors)) h.update(''.join(data).encode('utf-8', 'surrogatepass')) result = h.hexdigest() self.assertEqual(result, self.expectedchecksum)
class UnicodeDatabaseTest(unittest.TestCase):
def setUp(self): # In case unicodedata is not available, this will raise an ImportError, # but the other test cases will still be run import unicodedata self.db = unicodedata
def tearDown(self): del self.db db = unicodedata
class UnicodeFunctionsTest(UnicodeDatabaseTest):
# Update this if the database changes. Make sure to do a full rebuild # (e.g. 'make distclean && make') to get the correct checksum. expectedchecksum = 'c44a49ca7c5cb6441640fe174ede604b45028652' expectedchecksum = '98d602e1f69d5c5bb8a5910c40bbbad4e18e8370' # TODO: RUSTPYTHON @unittest.expectedFailure @requires_resource('cpu') def test_function_checksum(self): data = [] h = hashlib.sha1()
for i in range(0x10000): for i in range(sys.maxunicode + 1): char = chr(i) data = [ # Properties @@ -106,6 +99,15 @@ def test_function_checksum(self): result = h.hexdigest() self.assertEqual(result, self.expectedchecksum)
# TODO: RUSTPYTHON @unittest.expectedFailure @requires_resource('cpu') def test_name_inverse_lookup(self): for i in range(sys.maxunicode + 1): char = chr(i) if looked_name := self.db.name(char, None): self.assertEqual(self.db.lookup(looked_name), char)
# TODO: RUSTPYTHON @unittest.expectedFailure def test_digit(self): @@ -201,15 +203,8 @@ def test_combining(self): self.assertRaises(TypeError, self.db.combining) self.assertRaises(TypeError, self.db.combining, 'xx')
def test_normalize(self): self.assertRaises(TypeError, self.db.normalize) self.assertRaises(ValueError, self.db.normalize, 'unknown', 'xx') self.assertEqual(self.db.normalize('NFKC', ''), '') # The rest can be found in test_normalization.py # which requires an external file.
def test_pr29(self): # http://www.unicode.org/review/pr-29.html # https://www.unicode.org/review/pr-29.html # See issues #1054943 and #10254. composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161", 'Li\u030dt-s\u1e73\u0301', @@ -240,9 +235,6 @@ def test_issue29456(self): self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b) self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
# For tests of unicodedata.is_normalized / self.db.is_normalized , # see test_normalization.py .
def test_east_asian_width(self): eaw = self.db.east_asian_width self.assertRaises(TypeError, eaw, b'a') @@ -265,6 +257,11 @@ def test_east_asian_width_9_0_changes(self):
class UnicodeMiscTest(UnicodeDatabaseTest):
@cpython_only def test_disallow_instantiation(self): # Ensure that the type disallows instantiation (bpo-43916) check_disallow_instantiation(self, unicodedata.UCD)
# TODO: RUSTPYTHON @unittest.expectedFailure def test_failed_import_during_compiling(self): @@ -363,5 +360,103 @@ def test_linebreak_7643(self): self.assertEqual(len(lines), 1, r"\u%.4x should not be a linebreak" % i)
class NormalizationTest(unittest.TestCase):
    """Normalization checks driven by the ``NormalizationTest.txt`` data file.

    ``test_normalization`` fetches the data file and feeds it to
    ``run_normalization_tests``; the remaining tests need no external data.
    """

    @staticmethod
    def check_version(testfile):
        """Return whether the first line of *testfile* names our UCD version.

        Used as the ``check`` callback handed to ``open_urlresource``.
        """
        hdr = testfile.readline()
        return unicodedata.unidata_version in hdr

    @staticmethod
    def unistr(data):
        """Convert space-separated hexadecimal code points into a string."""
        # Split on a single space (not arbitrary whitespace) so that a
        # malformed field still raises ValueError from int().
        return "".join(chr(int(code, 16)) for code in data.split(" "))

    @requires_resource('network')
    def test_normalization(self):
        TESTDATAFILE = "NormalizationTest.txt"
        TESTDATAURL = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{TESTDATAFILE}"

        # Hit the exception early
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=self.check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {TESTDATAURL} "
                          f"into the test data directory")
        except (OSError, HTTPException) as exc:
            self.skipTest(f"Failed to download {TESTDATAURL}: {exc}")

        with testdata:
            self.run_normalization_tests(testdata)

    def run_normalization_tests(self, testdata):
        """Check every record of *testdata*, then all remaining code points.

        *testdata* is an iterable of text lines.  Anything after ``#`` on a
        line is ignored, lines starting with ``@Part`` select the current
        part, and every other non-empty line holds five ``;``-terminated
        fields of space-separated hexadecimal code points (c1..c5).
        """
        part = None
        # Source strings (c1) seen while inside "@Part1".
        part1_data = set()

        def NFC(text):
            return unicodedata.normalize("NFC", text)

        def NFKC(text):
            return unicodedata.normalize("NFKC", text)

        def NFD(text):
            return unicodedata.normalize("NFD", text)

        def NFKD(text):
            return unicodedata.normalize("NFKD", text)

        is_normalized = unicodedata.is_normalized

        for line in testdata:
            # Drop the trailing comment (if any) and surrounding whitespace.
            line = line.partition('#')[0].strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            # The line ends with ';', so the last split item is discarded.
            c1, c2, c3, c4, c5 = [self.unistr(x)
                                  for x in line.split(';')[:-1]]

            # Perform tests
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            # Report the offending line on failure, like the checks above.
            self.assertTrue(is_normalized("NFC", c2), line)
            self.assertTrue(is_normalized("NFC", c4), line)

            self.assertTrue(is_normalized("NFD", c3), line)
            self.assertTrue(is_normalized("NFD", c5), line)

            self.assertTrue(is_normalized("NFKC", c4), line)
            self.assertTrue(is_normalized("NFKD", c5), line)

            # Record part 1 data
            if part == "@Part1":
                part1_data.add(c1)

        # Perform tests for all other data: every code point that was not a
        # Part1 source string must be left unchanged by all four forms.
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_edge_cases(self):
        self.assertRaises(TypeError, unicodedata.normalize)
        self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx')
        self.assertEqual(unicodedata.normalize('NFKC', ''), '')

    def test_bug_834676(self):
        # Check for bug 834676
        unicodedata.normalize('NFC', '\ud55c\uae00')

# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()