|
9 | 9 | )
|
10 | 10 | from .utils import (
|
11 | 11 | is_accentuated,
|
| 12 | + is_arabic, |
| 13 | + is_arabic_isolated_form, |
12 | 14 | is_case_variable,
|
13 | 15 | is_cjk,
|
14 | 16 | is_emoticon,
|
@@ -127,8 +129,9 @@ def reset(self) -> None: # pragma: no cover
|
127 | 129 |
|
128 | 130 | @property
|
129 | 131 | def ratio(self) -> float:
|
130 |
| - if self._character_count == 0 or self._character_count < 8: |
| 132 | + if self._character_count < 8: |
131 | 133 | return 0.0
|
| 134 | + |
132 | 135 | ratio_of_accentuation: float = self._accentuated_count / self._character_count
|
133 | 136 | return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
|
134 | 137 |
|
@@ -455,6 +458,34 @@ def ratio(self) -> float:
|
455 | 458 | return self._successive_upper_lower_count_final / self._character_count
|
456 | 459 |
|
457 | 460 |
|
| 461 | +class ArabicIsolatedFormPlugin(MessDetectorPlugin): |
| 462 | + def __init__(self) -> None: |
| 463 | + self._character_count: int = 0 |
| 464 | + self._isolated_form_count: int = 0 |
| 465 | + |
| 466 | + def reset(self) -> None: # pragma: no cover |
| 467 | + self._character_count = 0 |
| 468 | + self._isolated_form_count = 0 |
| 469 | + |
| 470 | + def eligible(self, character: str) -> bool: |
| 471 | + return is_arabic(character) |
| 472 | + |
| 473 | + def feed(self, character: str) -> None: |
| 474 | + self._character_count += 1 |
| 475 | + |
| 476 | + if is_arabic_isolated_form(character): |
| 477 | + self._isolated_form_count += 1 |
| 478 | + |
| 479 | + @property |
| 480 | + def ratio(self) -> float: |
| 481 | + if self._character_count < 8: |
| 482 | + return 0.0 |
| 483 | + |
| 484 | + isolated_form_usage: float = self._isolated_form_count / self._character_count |
| 485 | + |
| 486 | + return isolated_form_usage |
| 487 | + |
| 488 | + |
458 | 489 | @lru_cache(maxsize=1024)
|
459 | 490 | def is_suspiciously_successive_range(
|
460 | 491 | unicode_range_a: Optional[str], unicode_range_b: Optional[str]
|
|
0 commit comments