◐ Shell
reader mode source ↗
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
File filter
Conversations
Jump to
Diff view
Apply and reload
Show whitespace
Diff view
Apply and reload
301 changes: 215 additions & 86 deletions Cargo.lock
8 changes: 2 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -222,15 +222,11 @@ strum = "0.28"
strum_macros = "0.28"
syn = "2"
thiserror = "2.0"
unicode-casing = "0.1.1"
unic-char-property = "0.9.0"
unic-normal = "0.9.0"
unic-ucd-age = "0.9.0"
unic-ucd-bidi = "0.9.0"
unic-ucd-category = "0.9.0"
unic-ucd-ident = "0.9.0"
unicode_names2 = "2.0.0"
unicode-bidi-mirroring = "0.4"
widestring = "1.2.0"
windows-sys = "0.61.2"
wasm-bindgen = "0.2.106"
Expand Down
1 change: 1 addition & 0 deletions Lib/test/test_str.py
Original file line number Diff line number Diff line change
@@ -854,6 +854,7 @@ def test_isprintable(self):
self.assertTrue('\U0001F46F'.isprintable())
self.assertFalse('\U000E0020'.isprintable())

@support.requires_resource('cpu')
def test_isprintable_invariant(self):
for codepoint in range(sys.maxunicode + 1):
Expand Down
2 changes: 1 addition & 1 deletion Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,6 @@ def test_issue10254(self):
b = 'C\u0338' * 20 + '\xC7'
self.assertEqual(self.db.normalize('NFC', a), b)

@unittest.expectedFailure # TODO: RUSTPYTHON; ? +
def test_issue29456(self):
# Fix #29456
u1176_str_a = '\u1100\u1176\u11a8'
Expand Down @@ -389,6 +388,7 @@ def unistr(data):
data = [int(x, 16) for x in data.split(" ")]
return "".join([chr(x) for x in data])

@requires_resource('network')
@requires_resource('cpu')
def test_normalization(self):
Expand Down
17 changes: 14 additions & 3 deletions crates/literal/src/char.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use unic_ucd_category::GeneralCategory;

/// According to Python, the following categories aren't printable:
/// * Cc (Other, Control)
Expand All @@ -10,6 +10,17 @@ use unic_ucd_category::GeneralCategory;
/// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
/// * Zs (Separator, Space) other than ASCII space('\x20').
pub fn is_printable(c: char) -> bool {
    // A character is treated as printable only when its general category is
    // neither in the "Other" group nor in the "Separator" group.
    // NOTE(review): there is no special case for ASCII space here, so callers
    // presumably handle '\x20' themselves — confirm against the call sites.
    let category = GeneralCategory::of(c);
    if category.is_other() {
        return false;
    }
    !category.is_separator()
}
7 changes: 2 additions & 5 deletions crates/stdlib/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,10 @@ constant_time_eq = { workspace = true }
## unicode stuff
unicode_names2 = { workspace = true }
# update version all at the same time
unic-char-property = { workspace = true }
unic-normal = { workspace = true }
unic-ucd-bidi = { workspace = true }
unic-ucd-category = { workspace = true }
unic-ucd-age = { workspace = true }
ucd = "0.1.1"
unicode-bidi-mirroring = { workspace = true }

# compression
adler32 = "1.2.0"
Expand Down
102 changes: 62 additions & 40 deletions crates/stdlib/src/unicodedata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,19 @@ mod unicodedata {
builtins::{PyModule, PyStrRef},
function::OptionalArg,
};
use itertools::Itertools;
use rustpython_common::wtf8::{CodePoint, Wtf8Buf};
use ucd::{Codepoint, DecompositionType, EastAsianWidth, Number, NumericType};
use unic_char_property::EnumeratedCharProperty;
use unic_normal::StrNormalForm;
use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion};
use unic_ucd_bidi::BidiClass;
use unic_ucd_category::GeneralCategory;
use unicode_bidi_mirroring::is_mirroring;

pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py<PyModule>) -> PyResult<()> {
__module_exec(vm, module);
Expand Down Expand Up @@ -117,9 +121,9 @@ mod unicodedata {
.extract_char(character, vm)?
.map_or(GeneralCategory::Unassigned, |c| {
c.to_char()
.map_or(GeneralCategory::Surrogate, GeneralCategory::of)
})
.abbr_name()
.to_owned())
}

Expand Up @@ -165,8 +169,8 @@ mod unicodedata {
let bidi = match self.extract_char(character, vm)? {
Some(c) => c
.to_char()
.map_or(BidiClass::LeftToRight, BidiClass::of)
.abbr_name(),
None => "",
};
Ok(bidi)
Expand All @@ -182,18 +186,34 @@ mod unicodedata {
Ok(self
.extract_char(character, vm)?
.and_then(|c| c.to_char())
.map_or(EastAsianWidth::Neutral, |c| c.east_asian_width())
.abbr_name())
}

#[pymethod]
fn normalize(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<Wtf8Buf> {
    // Pick the normalization routine that corresponds to the requested form
    // and collect the transformed text into a fresh buffer.
    let source = unistr.as_wtf8();
    let result: Wtf8Buf = match form {
        Nfc => source.map_utf8(|chunk| chunk.nfc()).collect(),
        Nfkc => source.map_utf8(|chunk| chunk.nfkc()).collect(),
        Nfd => source.map_utf8(|chunk| chunk.nfd()).collect(),
        Nfkd => source.map_utf8(|chunk| chunk.nfkd()).collect(),
    };
    Ok(result)
}
@@ -202,10 +222,26 @@ mod unicodedata {
fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<bool> {
    // Normalize the input into the requested form, then report whether the
    // result is identical to the input.
    let original = unistr.as_wtf8();
    let renormalized: Wtf8Buf = match form {
        Nfc => original.map_utf8(|chunk| chunk.nfc()).collect(),
        Nfkc => original.map_utf8(|chunk| chunk.nfkc()).collect(),
        Nfd => original.map_utf8(|chunk| chunk.nfd()).collect(),
        Nfkd => original.map_utf8(|chunk| chunk.nfkd()).collect(),
    };
    let unchanged = original == &*renormalized;
    Ok(unchanged)
}
Expand All @@ -216,7 +252,8 @@ mod unicodedata {
Some(c) => {
if let Some(ch) = c.to_char() {
// Check if the character is mirrored in bidirectional text using Unicode standard
Ok(if is_mirroring(ch) { 1 } else { 0 })
} else {
Ok(0)
}
Expand All @@ -226,11 +263,13 @@ mod unicodedata {
}

#[pymethod]
fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
    // Falls back to 0 when no character is extracted or when the code point
    // cannot be converted to a `char`.
    let maybe_char = self.extract_char(character, vm)?.and_then(|c| c.to_char());
    let class = match maybe_char {
        Some(ch) => ch.canonical_combining_class() as i32,
        None => 0,
    };
    Ok(class)
}

#[pymethod]
Expand Down Expand Up @@ -339,23 +378,6 @@ mod unicodedata {
}
}

trait EastAsianWidthAbbrName {
fn abbr_name(&self) -> &'static str;
}

impl EastAsianWidthAbbrName for EastAsianWidth {
fn abbr_name(&self) -> &'static str {
match self {
Self::Narrow => "Na",
Self::Wide => "W",
Self::Neutral => "N",
Self::Ambiguous => "A",
Self::FullWidth => "F",
Self::HalfWidth => "H",
}
}
}

#[pyattr]
fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef<Ucd> {
Ucd {
31 changes: 22 additions & 9 deletions crates/vm/src/builtins/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,10 @@ use rustpython_common::{
str::DeduceStrKind,
wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat},
};
use unic_ucd_bidi::BidiClass;
use unic_ucd_category::GeneralCategory;
use unic_ucd_ident::{is_xid_continue, is_xid_start};
use unicode_casing::CharExt;

impl<'a> TryFromBorrowedObject<'a> for String {
Expand Down Expand Up @@ -966,7 +967,9 @@ impl PyStr {
#[pymethod]
fn isdecimal(&self) -> bool {
    // An empty string is never decimal.
    if self.data.is_empty() {
        return false;
    }
    // Every character must have the DecimalNumber general category.
    self.char_all(|ch| GeneralCategory::of(ch) == GeneralCategory::DecimalNumber)
}

fn __mod__(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult<Wtf8Buf> {
Expand Down Expand Up @@ -1091,11 +1094,17 @@ impl PyStr {

#[pymethod]
fn isspace(&self) -> bool {
    use unic_ucd_bidi::bidi_class::abbr_names::*;
    // An empty string is never whitespace.
    if self.data.is_empty() {
        return false;
    }
    // A character counts as whitespace when its general category is
    // SpaceSeparator or its bidi class is one of WS, B, or S.
    self.char_all(|ch| {
        let is_space_separator = GeneralCategory::of(ch) == GeneralCategory::SpaceSeparator;
        is_space_separator || matches!(BidiClass::of(ch), WS | B | S)
    })
}

Expand Down Expand Up @@ -1355,9 +1364,13 @@ impl PyStr {
pub fn isidentifier(&self) -> bool {
    // Text that cannot be viewed as a `str` is never an identifier.
    let Some(text) = self.to_str() else { return false };
    let mut rest = text.chars();
    match rest.next() {
        // The first character must be '_' or XID_Start; every following
        // character must be XID_Continue. An empty string has no first
        // character and therefore falls through to `false`.
        Some(first) if first == '_' || is_xid_start(first) => rest.all(is_xid_continue),
        _ => false,
    }
}

// https://docs.python.org/3/library/stdtypes.html#str.translate
Expand Down
Loading
Toggle all file notes Toggle all file annotations