Package pyarabic ::
Module araby
|
|
1
2
3 """
4 Arabic module
5 @author: Taha Zerrouki
6 @contact: taha dot zerrouki at gmail dot com
7 @copyright: Arabtechies, Arabeyes, Taha Zerrouki
8 @license: GPL
9 @date:2010/03/01
10 @version: 0.1
11 """
12 import re
13
14 if __name__ == "__main__":
15 import stack
16 else:
17 import pyarabic.stack as stack
18
19 COMMA = u'\u060C'
20 SEMICOLON = u'\u061B'
21 QUESTION = u'\u061F'
22 HAMZA = u'\u0621'
23 ALEF_MADDA = u'\u0622'
24 ALEF_HAMZA_ABOVE = u'\u0623'
25 WAW_HAMZA = u'\u0624'
26 ALEF_HAMZA_BELOW = u'\u0625'
27 YEH_HAMZA = u'\u0626'
28 ALEF = u'\u0627'
29 BEH = u'\u0628'
30 TEH_MARBUTA = u'\u0629'
31 TEH = u'\u062a'
32 THEH = u'\u062b'
33 JEEM = u'\u062c'
34 HAH = u'\u062d'
35 KHAH = u'\u062e'
36 DAL = u'\u062f'
37 THAL = u'\u0630'
38 REH = u'\u0631'
39 ZAIN = u'\u0632'
40 SEEN = u'\u0633'
41 SHEEN = u'\u0634'
42 SAD = u'\u0635'
43 DAD = u'\u0636'
44 TAH = u'\u0637'
45 ZAH = u'\u0638'
46 AIN = u'\u0639'
47 GHAIN = u'\u063a'
48 TATWEEL = u'\u0640'
49 FEH = u'\u0641'
50 QAF = u'\u0642'
51 KAF = u'\u0643'
52 LAM = u'\u0644'
53 MEEM = u'\u0645'
54 NOON = u'\u0646'
55 HEH = u'\u0647'
56 WAW = u'\u0648'
57 ALEF_MAKSURA = u'\u0649'
58 YEH = u'\u064a'
59 MADDA_ABOVE = u'\u0653'
60 HAMZA_ABOVE = u'\u0654'
61 HAMZA_BELOW = u'\u0655'
62 ZERO = u'\u0660'
63 ONE = u'\u0661'
64 TWO = u'\u0662'
65 THREE = u'\u0663'
66 FOUR = u'\u0664'
67 FIVE = u'\u0665'
68 SIX = u'\u0666'
69 SEVEN = u'\u0667'
70 EIGHT = u'\u0668'
71 NINE = u'\u0669'
72 PERCENT = u'\u066a'
73 DECIMAL = u'\u066b'
74 THOUSANDS = u'\u066c'
75 STAR = u'\u066d'
76 MINI_ALEF = u'\u0670'
77 ALEF_WASLA = u'\u0671'
78 FULL_STOP = u'\u06d4'
79 BYTE_ORDER_MARK = u'\ufeff'
80
81
82 FATHATAN = u'\u064b'
83 DAMMATAN = u'\u064c'
84 KASRATAN = u'\u064d'
85 FATHA = u'\u064e'
86 DAMMA = u'\u064f'
87 KASRA = u'\u0650'
88 SHADDA = u'\u0651'
89 SUKUN = u'\u0652'
90
91
92 SMALL_ALEF = u"\u0670"
93 SMALL_WAW = u"\u06E5"
94 SMALL_YEH = u"\u06E6"
95
96 LAM_ALEF = u'\ufefb'
97 LAM_ALEF_HAMZA_ABOVE = u'\ufef7'
98 LAM_ALEF_HAMZA_BELOW = u'\ufef9'
99 LAM_ALEF_MADDA_ABOVE = u'\ufef5'
100 SIMPLE_LAM_ALEF = u'\u0644\u0627'
101 SIMPLE_LAM_ALEF_HAMZA_ABOVE = u'\u0644\u0623'
102 SIMPLE_LAM_ALEF_HAMZA_BELOW = u'\u0644\u0625'
103 SIMPLE_LAM_ALEF_MADDA_ABOVE = u'\u0644\u0622'
104
105 LETTERS = u''.join([
106 ALEF, BEH, TEH, TEH_MARBUTA, THEH, JEEM, HAH, KHAH,
107 DAL, THAL, REH, ZAIN, SEEN, SHEEN, SAD, DAD, TAH, ZAH,
108 AIN, GHAIN, FEH, QAF, KAF, LAM, MEEM, NOON, HEH, WAW, YEH,
109 HAMZA, ALEF_MADDA, ALEF_HAMZA_ABOVE, WAW_HAMZA, ALEF_HAMZA_BELOW,
110 YEH_HAMZA,
111 ])
112
113 TASHKEEL = (FATHATAN, DAMMATAN, KASRATAN,
114 FATHA, DAMMA, KASRA,
115 SUKUN,
116 SHADDA)
117 HARAKAT = ( FATHATAN, DAMMATAN, KASRATAN,
118 FATHA, DAMMA, KASRA,
119 SUKUN
120 )
121 SHORTHARAKAT = ( FATHA, DAMMA, KASRA, SUKUN)
122
123 TANWIN = (FATHATAN, DAMMATAN, KASRATAN)
124
125 NOT_DEF_HARAKA = TATWEEL
126 LIGUATURES = (
127 LAM_ALEF,
128 LAM_ALEF_HAMZA_ABOVE,
129 LAM_ALEF_HAMZA_BELOW,
130 LAM_ALEF_MADDA_ABOVE,
131 )
132 HAMZAT = ( HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW,
133 ALEF_HAMZA_BELOW, ALEF_HAMZA_ABOVE,
134 )
135 ALEFAT = ( ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE,
136 ALEF_HAMZA_BELOW, ALEF_WASLA, ALEF_MAKSURA, SMALL_ALEF,
137 )
138 WEAK = ( ALEF, WAW, YEH, ALEF_MAKSURA)
139 YEHLIKE = ( YEH, YEH_HAMZA, ALEF_MAKSURA, SMALL_YEH )
140
141 WAWLIKE = ( WAW, WAW_HAMZA, SMALL_WAW )
142 TEHLIKE = ( TEH, TEH_MARBUTA )
143
144 SMALL = ( SMALL_ALEF, SMALL_WAW, SMALL_YEH)
145 MOON = (HAMZA, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW,
146 ALEF, BEH, JEEM, HAH, KHAH, AIN, GHAIN,
147 FEH, QAF, KAF, MEEM, HEH, WAW, YEH)
148
149 SUN = ( TEH, THEH, DAL, THAL, REH, ZAIN, SEEN, SHEEN,
150 SAD, DAD, TAH, ZAH, LAM, NOON, )
151
152 ALPHABETIC_ORDER = {
153 ALEF : 1,
154 BEH : 2,
155 TEH : 3,
156 TEH_MARBUTA : 3,
157 THEH : 4,
158 JEEM : 5,
159 HAH : 6,
160 KHAH : 7,
161 DAL : 8,
162 THAL : 9,
163 REH : 10,
164 ZAIN : 11,
165 SEEN : 12,
166 SHEEN : 13,
167 SAD : 14,
168 DAD : 15,
169 TAH : 16,
170 ZAH : 17,
171 AIN : 18,
172 GHAIN : 19,
173 FEH : 20,
174 QAF : 21,
175 KAF : 22,
176 LAM : 23,
177 MEEM : 24,
178 NOON : 25,
179 HEH : 26,
180 WAW : 27,
181 YEH : 28,
182 HAMZA : 29,
183
184 ALEF_MADDA : 29,
185 ALEF_HAMZA_ABOVE : 29,
186 WAW_HAMZA : 29,
187 ALEF_HAMZA_BELOW : 29,
188 YEH_HAMZA : 29,
189 }
190 NAMES = {
191 ALEF : u"ألف",
192 BEH : u"باء",
193 TEH : u'تاء' ,
194 TEH_MARBUTA : u'تاء مربوطة' ,
195 THEH : u'ثاء' ,
196 JEEM : u'جيم' ,
197 HAH : u'حاء' ,
198 KHAH : u'خاء' ,
199 DAL : u'دال' ,
200 THAL : u'ذال' ,
201 REH : u'راء' ,
202 ZAIN : u'زاي' ,
203 SEEN : u'سين' ,
204 SHEEN : u'شين' ,
205 SAD : u'صاد' ,
206 DAD : u'ضاد' ,
207 TAH : u'طاء' ,
208 ZAH : u'ظاء' ,
209 AIN : u'عين' ,
210 GHAIN : u'غين' ,
211 FEH : u'فاء' ,
212 QAF : u'قاف' ,
213 KAF : u'كاف' ,
214 LAM : u'لام' ,
215 MEEM : u'ميم' ,
216 NOON : u'نون' ,
217 HEH : u'هاء' ,
218 WAW : u'واو' ,
219 YEH : u'ياء' ,
220 HAMZA : u'همزة' ,
221
222 TATWEEL : u'تطويل' ,
223 ALEF_MADDA : u'ألف ممدودة' ,
224 ALEF_MAKSURA : u'ألف مقصورة' ,
225 ALEF_HAMZA_ABOVE : u'همزة على الألف' ,
226 WAW_HAMZA : u'همزة على الواو' ,
227 ALEF_HAMZA_BELOW : u'همزة تحت الألف' ,
228 YEH_HAMZA : u'همزة على الياء' ,
229 FATHATAN : u'فتحتان',
230 DAMMATAN : u'ضمتان',
231 KASRATAN : u'كسرتان',
232 FATHA : u'فتحة',
233 DAMMA : u'ضمة',
234 KASRA : u'كسرة',
235 SHADDA : u'شدة',
236 SUKUN : u'سكون',
237 }
238
239
# Pre-compiled patterns built from the mark/letter groups defined earlier in
# this module.  They are written with plain ``u""`` literals: the ``ur""``
# prefix is a syntax error on Python 3, and the only backslash needed
# (``\w`` in TOKEN_PATTERN) is escaped explicitly.

# Any single haraka (short vowels, sukun and tanwin); shadda is not included.
HARAKAT_PATTERN = re.compile(u"[%s]" % u"".join(HARAKAT), re.UNICODE)

# A haraka at the very end of the string, or a tanwin at any position.
LASTHARAKA_PATTERN = re.compile(
    u"[%s]$|[%s]" % (u"".join(HARAKAT), u"".join(TANWIN)), re.UNICODE)

# Any single short haraka: FATHA, DAMMA, KASRA or SUKUN.
SHORTHARAKAT_PATTERN = re.compile(
    u"[%s]" % u"".join(SHORTHARAKAT), re.UNICODE)

# Any single tashkeel mark (the harakat plus shadda).
TASHKEEL_PATTERN = re.compile(u"[%s]" % u"".join(TASHKEEL), re.UNICODE)

# Any single hamza form listed in HAMZAT.
HAMZAT_PATTERN = re.compile(u"[%s]" % u"".join(HAMZAT), re.UNICODE)

# Any single alef form listed in ALEFAT.
ALEFAT_PATTERN = re.compile(u"[%s]" % u"".join(ALEFAT), re.UNICODE)

# Any single Lam-Alef presentation-form ligature listed in LIGUATURES.
LIGUATURES_PATTERN = re.compile(u"[%s]" % u"".join(LIGUATURES), re.UNICODE)

# A token is a run of word characters and/or tashkeel marks; the capturing
# group makes ``split`` keep the tokens in its result.
TOKEN_PATTERN = re.compile(u"([\\w%s]+)" % u"".join(TASHKEEL), re.UNICODE)
# Tab, carriage return, form feed, vertical tab or a plain space.
TOKEN_REPLACE = re.compile('\t|\r|\f|\v| ')
259
260
261
262
264 """Checks for Arabic Sukun Mark.
265 @param archar: arabic unicode char
266 @type archar: unicode
267 @return:
268 @rtype:Boolean
269 """
270 return archar == SUKUN
271
273 """Checks for Arabic Shadda Mark.
274 @param archar: arabic unicode char
275 @type archar: unicode
276 @return:
277 @rtype:Boolean
278 """
279 return archar == SHADDA
280
282 """Checks for Arabic Tatweel letter modifier.
283 @param archar: arabic unicode char
284 @type archar: unicode
285 @return:
286 @rtype:Boolean
287 """
288 return archar == TATWEEL
289
291 """Checks for Arabic Tanwin Marks (FATHATAN, DAMMATAN, KASRATAN).
292 @param archar: arabic unicode char
293 @type archar: unicode
294 @return:
295 @rtype:Boolean
296 """
297 return archar in TANWIN
298
300 """Checks for Arabic Tashkeel Marks (
301 - FATHA, DAMMA, KASRA, SUKUN,
302 - SHADDA,
303 - FATHATAN, DAMMATAN, KASRATAN).
304 @param archar: arabic unicode char
305 @type archar: unicode
306 @return:
307 @rtype:Boolean
308 """
309 return archar in TASHKEEL
310
312 """Checks for Arabic Harakat Marks (FATHA, DAMMA, KASRA, SUKUN, TANWIN).
313 @param archar: arabic unicode char
314 @type archar: unicode
315 @return:
316 @rtype:Boolean
317 """
318 return archar in HARAKAT
319
321 """Checks for Arabic short Harakat Marks (FATHA, DAMMA, KASRA, SUKUN).
322 @param archar: arabic unicode char
323 @type archar: unicode
324 @return:
325 @rtype:Boolean
326 """
327 return archar in SHORTHARAKAT
328
330 """Checks for Arabic Ligatures like LamAlef.
331 (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE)
332 @param archar: arabic unicode char
333 @type archar: unicode
334 @return:
335 @rtype:Boolean
336 """
337 return archar in LIGUATURES
338
340 """Checks for Arabic Hamza forms.
341 HAMZAT are (HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW, ALEF_HAMZA_BELOW, ALEF_HAMZA_ABOVE )
342 @param archar: arabic unicode char
343 @type archar: unicode
344 @return:
345 @rtype:Boolean
346 """
347 return archar in HAMZAT
348
350 """Checks for Arabic Alef forms.
351 ALEFAT = (ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, ALEF_WASLA, ALEF_MAKSURA )
352 @param archar: arabic unicode char
353 @type archar: unicode
354 @return:
355 @rtype:Boolean
356 """
357 return archar in ALEFAT
358
360 """Checks for Arabic Yeh forms.
361 Yeh forms : YEH, YEH_HAMZA, SMALL_YEH, ALEF_MAKSURA
362 @param archar: arabic unicode char
363 @type archar: unicode
364 @return:
365 @rtype:Boolean
366 """
367 return archar in YEHLIKE
368
370 """Checks for Arabic Waw like forms.
371 Waw forms : WAW, WAW_HAMZA, SMALL_WAW
372 @param archar: arabic unicode char
373 @type archar: unicode
374 @return:
375 @rtype:Boolean
376 """
377 return archar in WAWLIKE
378
380 """Checks for Arabic Teh forms.
381 Teh forms : TEH, TEH_MARBUTA
382 @param archar: arabic unicode char
383 @type archar: unicode
384 @return:
385 @rtype:Boolean
386 """
387 return archar in TEHLIKE
388
390 """Checks for Arabic Small letters.
391 SMALL Letters : SMALL ALEF, SMALL WAW, SMALL YEH
392 @param archar: arabic unicode char
393 @type archar: unicode
394 @return:
395 @rtype:Boolean
396 """
397 return archar in SMALL
398
400 """Checks for Arabic Weak letters.
401 Weak Letters : ALEF, WAW, YEH, ALEF_MAKSURA
402 @param archar: arabic unicode char
403 @type archar: unicode
404 @return:
405 @rtype:Boolean
406 """
407 return archar in WEAK
408
410 """Checks for Arabic Moon letters.
411 Moon Letters :
412 @param archar: arabic unicode char
413 @type archar: unicode
414 @return:
415 @rtype:Boolean
416 """
417 return archar in MOON
418
420 """Checks for Arabic Sun letters.
421 Sun Letters :
422 @param archar: arabic unicode char
423 @type archar: unicode
424 @return:
425 @rtype:Boolean
426 """
427 return archar in SUN
428
429
430
432 """return Arabic letter order between 1 and 29.
433 Alef order is 1, Yeh is 28, Hamza is 29.
434 Teh Marbuta has the same order as Teh, 3.
435 @param archar: arabic unicode char
436 @type archar: unicode
437 @return: arabic order.
438 @rtype: integer
439 """
440 return ALPHABETIC_ORDER.get(archar, 0)
441
443 """Return the Arabic name of the given character, as listed in NAMES;
444 an empty string is returned for characters that have no entry.
445 @param archar: arabic unicode char
446 @type archar: unicode
447 @return: arabic name.
448 @rtype: unicode
449 """
450 return NAMES.get(archar, u'')
451
453 u"""Return a list of Arabic characters.
454 The list holds the characters from U+0600 to U+0652 inclusive.
455 @return: list of Arabic characters.
456 @rtype: list of unicode
457 """
458 mylist = []
459 for i in range(0x0600, 0x00653):
460 try :
461 mylist.append(unichr(i))
462 except ValueError:
463 pass
464 return mylist
465
466
467
469 """Checks if the arabic word contains shadda.
470 @param word: arabic unicode char
471 @type word: unicode
472 @return: if shadda exists
473 @rtype:Boolean
474 """
475 if re.search(SHADDA, word):
476 return True
477 return False
478
479
480
482 """Checks if the arabic word is vocalized.
483 The word mustn't contain any spaces or punctuation.
484 @param word: arabic unicode char
485 @type word: unicode
486 @return: if the word is vocalized
487 @rtype:Boolean
488 """
489 if word.isalpha():
490 return False
491 for char in word:
492 if is_tashkeel(char):
493 return True
494 else:
495 return False
496
498 """Checks if the arabic text is vocalized.
499 The text can contain many words and spaces
500 @param text: arabic unicode char
501 @type text: unicode
502 @return: if the word is vocalized
503 @rtype:Boolean
504 """
505 if re.search(HARAKAT_PATTERN, text):
506 return True
507 else:
508 return False
510 """ Checks for an Arabic standard Unicode block characters
511 An Arabic string can contain spaces, digits and punctuation,
512 but only arabic standard characters, not extended arabic
513 @param text: input text
514 @type text: unicode
515 @return: True if all charaters are in Arabic block
516 @rtype: Boolean
517 """
518 if re.search(ur"([^\u0600-\u0652%s%s%s\s\d])"\
519 %(LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_MADDA_ABOVE), text):
520 return False
521 return True
522
524 """ Checks for an Arabic Unicode block characters
525 @param text: input text
526 @type text: unicode
527 @return: True if all charaters are in Arabic block
528 @rtype: Boolean
529 """
530 if re.search(u"([^\u0600-\u06ff\ufb50-\ufdff\ufe70-\ufeff\u0750-\u077f])", \
531 text):
532 return False
533 return True
534
536 """ Checks for an valid Arabic word.
537 An Arabic word contains no spaces, digits or punctuation;
538 avoid some spelling error, TEH_MARBUTA must be at the end.
539 @param word: input word
540 @type word: unicode
541 @return: True if all charaters are in Arabic block
542 @rtype: Boolean
543 """
544 if len(word) == 0 :
545 return False
546 elif re.search(u"([^\u0600-\u0652%s%s%s])"\
547 %(LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_MADDA_ABOVE), word):
548 return False
549 elif is_haraka(word[0]) or word[0] in (WAW_HAMZA, YEH_HAMZA):
550 return False
551
552 elif re.match(u"^(.)*[%s](.)+$"%ALEF_MAKSURA, word):
553 return False
554 elif re.match(u"^(.)*[%s]([^%s%s%s])(.)+$"%\
555 (TEH_MARBUTA, DAMMA, KASRA, FATHA), word):
556 return False
557 else:
558 return True
559
560
561
563 """
564 Return the first char
565 @param word: given word
566 @type word: unicode
567 @return: the first char
568 @rtype: unicode char
569 """
570 return word[0]
572 """
573 Return the second char
574 @param word: given word
575 @type word: unicode
576 @return: the second char
577 @rtype: unicode char
578 """
579 return word[1:2]
581 """
582 Return the last letter
583 example: zerrouki; 'i' is the last.
584 @param word: given word
585 @type word: unicode
586 @return: the last letter
587 @rtype: unicode char
588 """
589 return word[-1:]
591 """
592 Return the second last letter example: zerrouki; 'k' is the second last.
593 @param word: given word
594 @type word: unicode
595 @return: the second last letter
596 @rtype: unicode char
597 """
598 return word[-2:-1]
599
600
601
603 """Strip Harakat from arabic word except Shadda.
604 The striped marks are :
605 - FATHA, DAMMA, KASRA
606 - SUKUN
607 - FATHATAN, DAMMATAN, KASRATAN, , , .
608 Example:
609 >>> text = u"الْعَرَبِيّةُ"
610 >>> strip_harakat(text)
611 >>> العربيّة
612 @param text: arabic text.
613 @type text: unicode.
614 @return: return a striped text.
615 @rtype: unicode.
616 """
617
618
619
620 if not text:
621 return text
622 elif is_vocalized(text):
623 for char in HARAKAT:
624 text = text.replace(char, '')
625 return text
627 """Strip the last Haraka from arabic word except Shadda.
628 The striped marks are :
629 - FATHA, DAMMA, KASRA
630 - SUKUN
631 - FATHATAN, DAMMATAN, KASRATAN
632 Example:
633 >>> text = u"الْعَرَبِيّةُ"
634 >>> stripTashkeel(text)
635 >>> الْعَرَبِيّة
636 @param text: arabic text.
637 @type text: unicode.
638 @return: return a striped text.
639 @rtype: unicode.
640 """
641 if text:
642 if is_vocalized(text):
643 return re.sub(LASTHARAKA_PATTERN, u'', text)
644 return text
645
647 """Strip vowels from a text, include Shadda.
648 The striped marks are :
649 - FATHA, DAMMA, KASRA
650 - SUKUN
651 - SHADDA
652 - FATHATAN, DAMMATAN, KASRATAN, , , .
653 Example:
654 >>> text = u"الْعَرَبِيّةُ"
655 >>> stripTashkeel(text)
656 العربية
657 @param text: arabic text.
658 @type text: unicode.
659 @return: return a striped text.
660 @rtype: unicode.
661 """
662 if not text:
663 return text
664 elif is_vocalized(text):
665 for char in TASHKEEL:
666 text = text.replace(char, '')
667 return text
669 """
670 Strip tatweel from a text and return a result text.
671 Example:
672 >>> text = u"العـــــربية"
673 >>> stripTatweel(text)
674 >>> العربية
675 @param text: arabic text.
676 @type text: unicode.
677 @return: return a striped text.
678 @rtype: unicode.
679 """
680 return text.replace(TATWEEL, '')
681
683 """
684 Strip Shadda from a text and return a result text.
685
686 Example:
687 >>> text = u"الشّمسيّة"
688 >>> strip_shadda(text)
689 الشمسية
690
691 @param text: arabic text.
692 @type text: unicode.
693 @return: return a striped text.
694 @rtype: unicode.
695 """
696 return text.replace(SHADDA, '')
697
699 """Normalize Lam Alef ligatures into two letters (LAM and ALEF),
700 and return the resulting text.
701 Some systems present lamAlef ligature as a single letter,
702 this function convert it into two letters,
703 The converted letters into LAM and ALEF are :
704 - LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE
705 Example:
706 >>> text = u"لانها لالء الاسلام"
707 >>> normalizeLigature(text)
708 لانها لالئ الاسلام
709
710 @param text: arabic text.
711 @type text: unicode.
712 @return: return a converted text.
713 @rtype: unicode.
714 """
715 if text:
716 return LIGUATURES_PATTERN.sub(u'%s%s'%(LAM, ALEF), text)
717 return text
718
720 """Standardize the Hamzat into one form of hamza,
721 replace Madda by hamza and alef.
722 Replace the LamAlefs by simplified letters.
723 Example:
724 >>> text = u"سئل أحد الأئمة"
725 >>> normalizeHamza(text)
726 سءل ءحد الءءمة
727
728 @param word: arabic text.
729 @type word: unicode.
730 @return: return a converted text.
731 @rtype: unicode.
732 """
733 if word.startswith(ALEF_MADDA):
734 if len(word)>= 3 and (word[1] not in HARAKAT) and \
735 (word[2] == SHADDA or len(word) == 3):
736 word = HAMZA + ALEF + word[1:]
737 else:
738 word = HAMZA + HAMZA + word[1:]
739
740 word = word.replace(ALEF_MADDA, HAMZA+HAMZA)
741 word = HAMZAT_PATTERN.sub(HAMZA, word)
742 return word
743
def separate(word, extract_shadda=False):
    """
    Separate the letters of an Arabic word from its marks.

    Every letter gets exactly one mark: its haraka when the word supplies
    one, otherwise NOT_DEF_HARAKA.  A SHADDA is kept in the letters string;
    the letter it doubles receives SUKUN and the shadda itself carries the
    haraka that follows it (or NOT_DEF_HARAKA).  Harakat that appear before
    the first letter have nothing to attach to and are dropped.

    A word that starts with SHADDA has no base letter to put SUKUN on; in
    that case only the shadda entry is produced, so the letters and the
    marks always have the same length.

    When extract_shadda is true a third string is returned in which every
    non-shadda letter is replaced by TATWEEL and each TATWEEL directly
    followed by SHADDA is collapsed into that SHADDA, and the returned
    letters have their shadda stripped.

    @param word: the input word
    @type word: unicode
    @param extract_shadda: extract shadda as separate text
    @type extract_shadda: Boolean
    @return: (letters, marks), or (letters, marks, shaddaplaces) when
        extract_shadda is true
    @rtype: tuple of unicode
    """
    letters = []
    marks = []
    for char in word:
        if char in HARAKAT:
            # An empty marks list means no letter has been seen yet:
            # leading harakat are ignored.
            if marks:
                marks[-1] = char
        elif char == SHADDA:
            # The doubled letter is unvowelled; only rewrite its mark when
            # such a letter actually exists.
            if marks:
                marks[-1] = SUKUN
            marks.append(NOT_DEF_HARAKA)
            letters.append(SHADDA)
        else:
            marks.append(NOT_DEF_HARAKA)
            letters.append(char)

    wordletters = u''.join(letters)
    wordmarks = u''.join(marks)
    if not extract_shadda:
        return (wordletters, wordmarks)

    # Build the shadda map: TATWEEL for plain letters, SHADDA where a letter
    # is doubled.
    shaddaplaces = re.sub(u'[^%s]' % SHADDA, TATWEEL, wordletters)
    shaddaplaces = re.sub(u'%s%s' % (TATWEEL, SHADDA), SHADDA, shaddaplaces)
    return (strip_shadda(wordletters), wordmarks, shaddaplaces)
798
799
def joint(letters, marks):
    """
    Join letters with their marks into a word.

    The two strings are read in parallel: each letter is emitted followed
    by its mark, except that NOT_DEF_HARAKA marks are omitted.  When a
    SHADDA is emitted directly after a haraka, that haraka is removed first,
    so the shadda attaches to the letter itself.

    A SHADDA in first position has nothing before it and is emitted as is.

    @param letters: the word letters
    @type letters: unicode
    @param marks: the word marks, one per letter
    @type marks: unicode
    @return: the joined word, or an empty string when letters and marks
        do not have the same length
    @rtype: unicode
    """
    if len(letters) != len(marks):
        return ""

    pieces = []
    for letter, mark in zip(letters, marks):
        # Drop the haraka sitting between a letter and its shadda.
        if letter == SHADDA and pieces and pieces[-1] in HARAKAT:
            pieces.pop()
        pieces.append(letter)
        if mark != NOT_DEF_HARAKA:
            pieces.append(mark)
    return ''.join(pieces)
843
844
846 """
847 If the two words have the same letters and the same harakat, this function returns True.
848 The two words can be full vocalized, or partial vocalized
849 @param word1: first word
850 @type word1: unicode
851 @param word2: second word
852 @type word2: unicode
853 @return: if two words have similar vocalization
854 @rtype: Boolean
855 """
856 if vocalized_similarity(word1, word2)<0:
857 return False
858 else: return True
859
860
861
862
864 """if the word1 is like a wazn (pattern),
865 the letters must be equal,
866 the wazn has FEH, AIN, LAM letters.
867 this are as generic letters.
868 The two words can be full vocalized, or partial vocalized
869 @param word1: input word
870 @type word1: unicode
871 @param wazn: given word template وزن
872 @type wazn: unicode
873 @return: if two words have similar vocalization
874 @rtype: Boolean
875 """
876 stack1 = stack.Stack(word1)
877 stack2 = stack.Stack(wazn)
878 root = stack.Stack()
879 last1 = stack1.pop()
880 last2 = stack2.pop()
881 vowels = HARAKAT
882 while last1 != None and last2 != None:
883 if last1 == last2 and last2 not in (FEH, AIN, LAM):
884 last1 = stack1.pop()
885 last2 = stack2.pop()
886 elif last1 not in vowels and last2 in (FEH, AIN, LAM):
887 root.push(last1)
888 print "t"
889 last1 = stack1.pop()
890 last2 = stack2.pop()
891 elif last1 in vowels and last2 not in vowels:
892 last1 = stack1.pop()
893 elif last1 not in vowels and last2 in vowels:
894 last2 = stack2.pop()
895 else:
896 break
897
898 root.items.reverse()
899 print " the root is ", root.items
900 if not (stack1.is_empty() and stack2.is_empty()):
901 return False
902 else: return True
903
905 """
906 If the two words have the same letters and the same harakat, this function returns True.
907 The first word is partially vocalized, the second is fully vocalized.
908 If the partial word contains a shadda, it must be at the same place in the full word.
909 @return: if contains shadda
910 @rtype: Boolean
911 """
912
913 if not has_shadda(partial):
914 return True
915
916 elif not has_shadda(fully) and has_shadda(partial):
917 return False
918
919 partial = strip_harakat(partial)
920 fully = strip_harakat(fully)
921 pstack = stack.Stack(partial)
922 vstack = stack.Stack(fully)
923 plast = pstack.pop()
924 vlast = vstack.pop()
925
926 while plast != None and vlast != None:
927 if plast == vlast:
928 plast = pstack.pop()
929 vlast = vstack.pop()
930 elif plast == SHADDA and vlast != SHADDA:
931
932 break
933 elif plast != SHADDA and vlast == SHADDA:
934
935 vlast = vstack.pop()
936 else:
937
938 break
939 if not (pstack.is_empty() and vstack.is_empty()):
940 return False
941 else: return True
942
944 """Reduce the Tashkeel, by deleting evident cases.
945 @param text: the input text fully vocalized.
946 @type text: unicode.
947 @return : partially vocalized text.
948 @rtype: unicode.
949 """
950 patterns = [
951
952 u"(?<!(%s|%s))(%s|%s)" % (WAW, YEH, SUKUN, FATHA),
953
954 u"%s(?=%s)" % (DAMMA, WAW),
955
956 u"%s(?=%s)" % (KASRA, YEH),
957
958
959 u"%s(?=%s)" % (FATHA, ALEF),
960
961 ur"(?<=\s(%s|%s))%s" % (WAW, YEH, FATHA),
962
963 u"(?<=%s)%s" % (ALEF_HAMZA_BELOW, KASRA),
964 ]
965 reduced = text
966 for pat in patterns:
967 reduced = re.sub(pat, '', reduced)
968 return reduced
969
971 """
972 if the two words has the same letters and the same harakats, this function return True.
973 The two words can be full vocalized, or partial vocalized
974 @param word1: first word
975 @type word1: unicode
976 @param word2: second word
977 @type word2: unicode
978 @return: return if words are similar, else return negative number of errors
979 @rtype: Boolean / int
980 """
981
982 stack1 = stack.Stack(word1)
983 stack2 = stack.Stack(word2)
984 last1 = stack1.pop()
985 last2 = stack2.pop()
986 err_count = 0
987 vowels = HARAKAT
988 while last1 != None and last2 != None:
989 if last1 == last2:
990 last1 = stack1.pop()
991 last2 = stack2.pop()
992 elif last1 in vowels and last2 not in vowels:
993 last1 = stack1.pop()
994 elif last1 not in vowels and last2 in vowels:
995 last2 = stack2.pop()
996 else:
997
998 if last1 == SHADDA:
999 last1 = stack1.pop()
1000 elif last2 == SHADDA:
1001 last2 = stack2.pop()
1002 else:
1003 last1 = stack1.pop()
1004 last2 = stack2.pop()
1005 err_count += 1
1006 if err_count > 0 :
1007 return -err_count
1008 else: return True
1009
1011 """
1012 Tokenize text into words
1013 @param text: the input text.
1014 @type text: unicode.
1015 @return: list of words.
1016 @rtype: list.
1017 """
1018 if text == u'':
1019 return []
1020 else:
1021
1022 mylist = TOKEN_PATTERN.split(text)
1023
1024 mylist = [TOKEN_REPLACE.sub('',x) for x in mylist if x]
1025
1026 mylist = [x for x in mylist if x]
1027 return mylist
1028
if __name__ == "__main__":
    # Manual smoke test for the vocalization comparison helpers.  Each
    # message is formatted into a single string so that the output is the
    # same whether ``print`` is a statement (Python 2) or a function
    # (Python 3).
    print("like:  %s" % (vocalizedlike(u'مُتَوَهِّمًا', u'متوهمًا'),))
    print("sim:  %s" % (vocalized_similarity(u'ثمّ', u'ثُمَّ'),))
    print("like:  %s" % (vocalizedlike(u'ثمّ', u'ثُمَّ'),))
    print("sim:  %s" % (vocalized_similarity(u'ثم', u'ثُمَّ'),))
    print("like:  %s" % (vocalizedlike(u'ثم', u'ثُمَّ'),))
    print("sim:  %s" % (vocalized_similarity(u'مُتَوَهِّمًا', u'متوهمًا'),))
    print("sim:  %s" % (vocalized_similarity(u'مُتَوَهِّمًا', u'متوهمًا'),))
1046