Skip to content

Commit 5226e8f

Browse files
committed
🐛 Regression on some detection cases showcased in the documentation (#371)
and added a noise (md) probe that identifies malformed Arabic representation due to the presence of letters in isolated form (credit to my wife, thanks!)
1 parent a4b9b01 commit 5226e8f

File tree

6 files changed

+69
-2
lines changed

6 files changed

+69
-2
lines changed

CHANGELOG.md

+3
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
66

77
### Fixed
88
- Unintentional memory usage regression when using a large payload that matches several encodings (#376)
9+
- Regression on some detection cases showcased in the documentation (#371)
910

11+
### Added
12+
- Noise (md) probe that identifies malformed Arabic representation due to the presence of letters in isolated form (credit to my wife)
1013

1114
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
1215

charset_normalizer/md.py

+32-1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
)
1010
from .utils import (
1111
is_accentuated,
12+
is_arabic,
13+
is_arabic_isolated_form,
1214
is_case_variable,
1315
is_cjk,
1416
is_emoticon,
@@ -127,8 +129,9 @@ def reset(self) -> None: # pragma: no cover
127129

128130
@property
129131
def ratio(self) -> float:
130-
if self._character_count == 0 or self._character_count < 8:
132+
if self._character_count < 8:
131133
return 0.0
134+
132135
ratio_of_accentuation: float = self._accentuated_count / self._character_count
133136
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
134137

@@ -455,6 +458,34 @@ def ratio(self) -> float:
455458
return self._successive_upper_lower_count_final / self._character_count
456459

457460

461+
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """Noise probe measuring how often Arabic letters appear in isolated form.

    Only characters accepted by ``is_arabic`` are fed to this plugin. It keeps
    two counters: every character fed, and those for which
    ``is_arabic_isolated_form`` is true. The reported ratio is the share of
    isolated forms among all characters fed.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # pragma: no cover
        """Put both counters back to zero."""
        self._character_count = self._isolated_form_count = 0

    def eligible(self, character: str) -> bool:
        """Return True when *character* is Arabic according to ``is_arabic``."""
        return is_arabic(character)

    def feed(self, character: str) -> None:
        """Count *character*, and count it again if it is an isolated form."""
        # The total is bumped first so it is updated before the form check runs.
        self._character_count += 1

        if not is_arabic_isolated_form(character):
            return

        self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        """Share of isolated forms among fed characters.

        Returns 0.0 while fewer than 8 characters have been fed.
        """
        total = self._character_count

        if total < 8:
            return 0.0

        return self._isolated_form_count / total
487+
488+
458489
@lru_cache(maxsize=1024)
459490
def is_suspiciously_successive_range(
460491
unicode_range_a: Optional[str], unicode_range_b: Optional[str]

charset_normalizer/utils.py

+22
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ def is_accentuated(character: str) -> bool:
3232
or "WITH DIAERESIS" in description
3333
or "WITH CIRCUMFLEX" in description
3434
or "WITH TILDE" in description
35+
or "WITH MACRON" in description
36+
or "WITH RING ABOVE" in description
3537
)
3638

3739

@@ -174,6 +176,26 @@ def is_thai(character: str) -> bool:
174176
return "THAI" in character_name
175177

176178

179+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    """Return True when the Unicode name of *character* contains ``ARABIC``.

    A character that has no Unicode name is reported as not Arabic.
    """
    # An empty default replaces the ValueError raised for unnamed code points;
    # "ARABIC" can never be found in it, so the answer is False in that case.
    return "ARABIC" in unicodedata.name(character, "")
187+
188+
189+
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    """Return True when *character* is an Arabic letter in isolated form.

    The decision is made on the Unicode name, which must contain both
    ``ARABIC`` and ``ISOLATED FORM``. A character that has no Unicode name is
    reported as False.
    """
    # An empty default replaces the ValueError raised for unnamed code points.
    unicode_name = unicodedata.name(character, "")

    if "ARABIC" not in unicode_name:
        return False

    return "ISOLATED FORM" in unicode_name
197+
198+
177199
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
178200
def is_unicode_range_secondary(range_name: str) -> bool:
179201
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)

tests/test_base_detection.py

+8
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,11 @@ def test_alphabets_property():
115115
assert "Basic Latin" in best_guess.alphabets
116116
assert "Emoticons range(Emoji)" in best_guess.alphabets
117117
assert best_guess.alphabets.count("Basic Latin") == 1
118+
119+
120+
def test_doc_example_short_cp1251():
    """The short cp1251 sentence from the documentation is detected as cp1251."""
    payload = 'Bсеки човек има право на образование.'.encode('cp1251')

    best_guess = from_bytes(payload).best()

    assert best_guess.encoding == "cp1251"

tests/test_large_payload.py

+3
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ def test_large_payload_u8_sig_basic_entry():
1212
assert best_guess.encoding == "utf_8", "Large U8 payload case detection wrongly detected!"
1313
assert best_guess.bom is True, "SIG/BOM property should be True"
1414
assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"
15+
assert best_guess._string is not None, "str should be decoded before direct access (sig available)"
1516

1617

1718
def test_large_payload_ascii_basic_entry():
@@ -22,6 +23,7 @@ def test_large_payload_ascii_basic_entry():
2223
assert best_guess.encoding == "ascii", "Large ASCII payload case detection wrongly detected!"
2324
assert best_guess.bom is False, "SIG/BOM property should be False"
2425
assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"
26+
assert best_guess._string is None, "str should not be decoded until direct access"
2527

2628

2729
def test_misleading_large_sequence():
@@ -32,5 +34,6 @@ def test_misleading_large_sequence():
3234
assert len(guesses) > 0
3335
match = guesses.best()
3436
assert match is not None
37+
assert match._string is not None, "str should be cached as only match"
3538
assert match.encoding == 'utf_8'
3639
assert str(match) is not None

tests/test_mess_detection.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v", 0.5, 1.),
1313
("ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli", 0.1, 0.5),
1414
("<i>Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.</i>", 0.01, 0.5),
15-
("""ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† Ø§Ų„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…Ø§ ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊ؈) ŲˆØ§Ų„ØŪØ§ØŠŲ…""", 0.8, 2.0),
15+
("""ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† Ø§Ų„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…Ø§ ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊ؈) ŲˆØ§Ų„ØŪØ§ØŠŲ…""", 0.8, 3.0),
1616
("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.5),
1717
("""[email protected] ุชุฑุฌู…ู€ู€ุฉ ู‡ู€ุดู€ู€ู€ุงู… ุงู„ู€ู‚ู€ู€ู€ู€ู„ุงูRadoZ ุชู€ู€ู€ุนู€ู€ู€ู€ุฏูŠู€ู€ู„ ุงู„ู€ู€ู€ุชู€ู€ู€ู€ูˆู‚ู€ู€ูŠู€ู€ู€ู€ุช ู…ู€ู€ู€ู† ู‚ู€ู€ุจู€ู€ู„""", 0.5, 2.0)
1818

0 commit comments

Comments
 (0)