Package pyarabic :: Module araby
[hide private]
[frames] | no frames]

Source Code for Module pyarabic.araby

   1  #!/usr/bin/python 
   2  # -*- coding=utf-8 -*- 
   3  """ 
   4  Arabic module 
   5  @author: Taha Zerrouki 
   6  @contact: taha dot zerrouki at gmail dot com 
   7  @copyright: Arabtechies,  Arabeyes,   Taha Zerrouki 
   8  @license: GPL 
   9  @date:2010/03/01 
  10  @version: 0.1 
  11  """ 
  12  import re 
  13   
  14  if __name__ == "__main__": 
  15      import stack 
  16  else: 
  17      import pyarabic.stack as stack 
  18   
  19  COMMA            = u'\u060C' 
  20  SEMICOLON        = u'\u061B' 
  21  QUESTION         = u'\u061F' 
  22  HAMZA            = u'\u0621' 
  23  ALEF_MADDA       = u'\u0622' 
  24  ALEF_HAMZA_ABOVE = u'\u0623' 
  25  WAW_HAMZA        = u'\u0624' 
  26  ALEF_HAMZA_BELOW = u'\u0625' 
  27  YEH_HAMZA        = u'\u0626' 
  28  ALEF             = u'\u0627' 
  29  BEH              = u'\u0628' 
  30  TEH_MARBUTA      = u'\u0629' 
  31  TEH              = u'\u062a' 
  32  THEH             = u'\u062b' 
  33  JEEM             = u'\u062c' 
  34  HAH              = u'\u062d' 
  35  KHAH             = u'\u062e' 
  36  DAL              = u'\u062f' 
  37  THAL             = u'\u0630' 
  38  REH              = u'\u0631' 
  39  ZAIN             = u'\u0632' 
  40  SEEN             = u'\u0633' 
  41  SHEEN            = u'\u0634' 
  42  SAD              = u'\u0635' 
  43  DAD              = u'\u0636' 
  44  TAH              = u'\u0637' 
  45  ZAH              = u'\u0638' 
  46  AIN              = u'\u0639' 
  47  GHAIN            = u'\u063a' 
  48  TATWEEL          = u'\u0640' 
  49  FEH              = u'\u0641' 
  50  QAF              = u'\u0642' 
  51  KAF              = u'\u0643' 
  52  LAM              = u'\u0644' 
  53  MEEM             = u'\u0645' 
  54  NOON             = u'\u0646' 
  55  HEH              = u'\u0647' 
  56  WAW              = u'\u0648' 
  57  ALEF_MAKSURA     = u'\u0649' 
  58  YEH              = u'\u064a' 
  59  MADDA_ABOVE      = u'\u0653' 
  60  HAMZA_ABOVE      = u'\u0654' 
  61  HAMZA_BELOW      = u'\u0655' 
  62  ZERO             = u'\u0660' 
  63  ONE              = u'\u0661' 
  64  TWO              = u'\u0662' 
  65  THREE            = u'\u0663' 
  66  FOUR             = u'\u0664' 
  67  FIVE             = u'\u0665' 
  68  SIX              = u'\u0666' 
  69  SEVEN            = u'\u0667' 
  70  EIGHT            = u'\u0668' 
  71  NINE             = u'\u0669' 
  72  PERCENT          = u'\u066a' 
  73  DECIMAL          = u'\u066b' 
  74  THOUSANDS        = u'\u066c' 
  75  STAR             = u'\u066d' 
  76  MINI_ALEF        = u'\u0670' 
  77  ALEF_WASLA       = u'\u0671' 
  78  FULL_STOP        = u'\u06d4' 
  79  BYTE_ORDER_MARK  = u'\ufeff' 
  80   
  81  # Diacritics 
  82  FATHATAN         = u'\u064b' 
  83  DAMMATAN         = u'\u064c' 
  84  KASRATAN         = u'\u064d' 
  85  FATHA            = u'\u064e' 
  86  DAMMA            = u'\u064f' 
  87  KASRA            = u'\u0650' 
  88  SHADDA           = u'\u0651' 
  89  SUKUN            = u'\u0652' 
  90   
  91  # Small Letters 
  92  SMALL_ALEF       = u"\u0670" 
  93  SMALL_WAW        = u"\u06E5" 
  94  SMALL_YEH        = u"\u06E6" 
  95  #Ligatures 
  96  LAM_ALEF                     = u'\ufefb' 
  97  LAM_ALEF_HAMZA_ABOVE         = u'\ufef7' 
  98  LAM_ALEF_HAMZA_BELOW         = u'\ufef9' 
  99  LAM_ALEF_MADDA_ABOVE         = u'\ufef5' 
 100  SIMPLE_LAM_ALEF              = u'\u0644\u0627' 
 101  SIMPLE_LAM_ALEF_HAMZA_ABOVE  = u'\u0644\u0623' 
 102  SIMPLE_LAM_ALEF_HAMZA_BELOW  = u'\u0644\u0625' 
 103  SIMPLE_LAM_ALEF_MADDA_ABOVE  = u'\u0644\u0622' 
 104  # groups 
 105  LETTERS = u''.join([ 
 106          ALEF,  BEH,  TEH,  TEH_MARBUTA,  THEH,  JEEM,  HAH,  KHAH,  
 107          DAL, THAL, REH,  ZAIN,  SEEN,  SHEEN,  SAD,  DAD,  TAH,  ZAH,  
 108          AIN,  GHAIN,  FEH,  QAF,  KAF,  LAM,  MEEM,  NOON,  HEH,  WAW,  YEH,  
 109          HAMZA,   ALEF_MADDA,  ALEF_HAMZA_ABOVE,  WAW_HAMZA,  ALEF_HAMZA_BELOW, 
 110          YEH_HAMZA,  
 111          ]) 
 112   
 113  TASHKEEL  = (FATHATAN,  DAMMATAN,  KASRATAN,  
 114              FATHA, DAMMA, KASRA,  
 115              SUKUN,  
 116              SHADDA) 
 117  HARAKAT  = ( FATHATAN,    DAMMATAN,    KASRATAN,  
 118              FATHA,   DAMMA,   KASRA,  
 119              SUKUN 
 120              ) 
 121  SHORTHARAKAT  = ( FATHA,   DAMMA,   KASRA,  SUKUN) 
 122   
 123  TANWIN  = (FATHATAN,   DAMMATAN,    KASRATAN) 
 124   
 125  NOT_DEF_HARAKA = TATWEEL 
 126  LIGUATURES = ( 
 127              LAM_ALEF,  
 128              LAM_ALEF_HAMZA_ABOVE,  
 129              LAM_ALEF_HAMZA_BELOW,  
 130              LAM_ALEF_MADDA_ABOVE,  
 131              ) 
 132  HAMZAT = (  HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW,  
 133              ALEF_HAMZA_BELOW, ALEF_HAMZA_ABOVE,  
 134              ) 
 135  ALEFAT = ( ALEF, ALEF_MADDA,  ALEF_HAMZA_ABOVE,  
 136              ALEF_HAMZA_BELOW, ALEF_WASLA, ALEF_MAKSURA, SMALL_ALEF,  
 137          ) 
 138  WEAK   = ( ALEF,  WAW,  YEH,  ALEF_MAKSURA) 
 139  YEHLIKE =  ( YEH,   YEH_HAMZA,   ALEF_MAKSURA,    SMALL_YEH  ) 
 140   
 141  WAWLIKE   = ( WAW,   WAW_HAMZA,   SMALL_WAW ) 
 142  TEHLIKE   = ( TEH,   TEH_MARBUTA ) 
 143   
 144  SMALL   = ( SMALL_ALEF,  SMALL_WAW,  SMALL_YEH) 
 145  MOON = (HAMZA, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, 
 146                  ALEF, BEH, JEEM, HAH, KHAH, AIN, GHAIN,  
 147                  FEH, QAF, KAF, MEEM, HEH, WAW, YEH) 
 148   
 149  SUN = ( TEH, THEH, DAL, THAL, REH, ZAIN, SEEN, SHEEN, 
 150                  SAD, DAD, TAH, ZAH, LAM, NOON, ) 
 151   
 152  ALPHABETIC_ORDER = { 
 153                  ALEF             : 1,  
 154                  BEH              : 2,  
 155                  TEH              : 3,  
 156                  TEH_MARBUTA      : 3,  
 157                  THEH             : 4,  
 158                  JEEM             : 5,  
 159                  HAH              : 6,  
 160                  KHAH             : 7,  
 161                  DAL              : 8,  
 162                  THAL             : 9,  
 163                  REH              : 10,  
 164                  ZAIN             : 11,  
 165                  SEEN             : 12,  
 166                  SHEEN            : 13,  
 167                  SAD              : 14,  
 168                  DAD              : 15,  
 169                  TAH              : 16,  
 170                  ZAH              : 17,  
 171                  AIN              : 18,  
 172                  GHAIN            : 19,  
 173                  FEH              : 20,  
 174                  QAF              : 21,  
 175                  KAF              : 22,  
 176                  LAM              : 23,  
 177                  MEEM             : 24,  
 178                  NOON             : 25,  
 179                  HEH              : 26,  
 180                  WAW              : 27,  
 181                  YEH              : 28,  
 182                  HAMZA            : 29,  
 183   
 184                  ALEF_MADDA       : 29,  
 185                  ALEF_HAMZA_ABOVE : 29,  
 186                  WAW_HAMZA        : 29,  
 187                  ALEF_HAMZA_BELOW : 29,  
 188                  YEH_HAMZA        : 29,  
 189                  } 
 190  NAMES  = { 
 191                  ALEF             :  u"ألف",  
 192                  BEH              : u"باء",  
 193                  TEH              : u'تاء' ,  
 194                  TEH_MARBUTA      : u'تاء مربوطة' ,  
 195                  THEH             : u'ثاء' ,  
 196                  JEEM             : u'جيم' ,  
 197                  HAH              : u'حاء' ,  
 198                  KHAH             : u'خاء' ,  
 199                  DAL              : u'دال' ,  
 200                  THAL             : u'ذال' ,  
 201                  REH              : u'راء' ,  
 202                  ZAIN             : u'زاي' ,  
 203                  SEEN             : u'سين' ,  
 204                  SHEEN            : u'شين' ,  
 205                  SAD              : u'صاد' ,  
 206                  DAD              : u'ضاد' ,  
 207                  TAH              : u'طاء' ,  
 208                  ZAH              : u'ظاء' ,  
 209                  AIN              : u'عين' ,  
 210                  GHAIN            : u'غين' ,  
 211                  FEH              : u'فاء' ,  
 212                  QAF              : u'قاف' ,  
 213                  KAF              : u'كاف' ,  
 214                  LAM              : u'لام' ,  
 215                  MEEM             : u'ميم' ,  
 216                  NOON             : u'نون' ,  
 217                  HEH              : u'هاء' ,  
 218                  WAW              : u'واو' ,  
 219                  YEH              : u'ياء' ,  
 220                  HAMZA            : u'همزة' ,  
 221   
 222                  TATWEEL          : u'تطويل' ,  
 223                  ALEF_MADDA       : u'ألف ممدودة' ,  
 224                  ALEF_MAKSURA      : u'ألف مقصورة' ,  
 225                  ALEF_HAMZA_ABOVE : u'همزة على الألف' ,  
 226                  WAW_HAMZA        : u'همزة على الواو' ,  
 227                  ALEF_HAMZA_BELOW : u'همزة تحت الألف' ,  
 228                  YEH_HAMZA        : u'همزة على الياء' ,  
 229                  FATHATAN         : u'فتحتان',  
 230                  DAMMATAN         : u'ضمتان',  
 231                  KASRATAN         : u'كسرتان',  
 232                  FATHA            : u'فتحة',  
 233                  DAMMA            : u'ضمة',  
 234                  KASRA            : u'كسرة',  
 235                  SHADDA           : u'شدة',  
 236                  SUKUN            : u'سكون',  
 237                  } 
 238  # regular expretion 
 239   
 240  HARAKAT_PATTERN  = re.compile(ur"["+u"".join(HARAKAT)+u"]",  re.UNICODE) 
 241  #~ """ pattern to strip Harakat""" 
 242  LASTHARAKA_PATTERN  = \ 
 243          re.compile(ur"[%s]$|[%s]"%(u"".join(HARAKAT), u''.join(TANWIN)), re.UNICODE) 
 244  #~ """ Pattern to strip only the last haraka """ 
 245  SHORTHARAKAT_PATTERN  = \ 
 246          re.compile(ur"["+u"".join(SHORTHARAKAT)+u"]",  re.UNICODE) 
 247  #~ Pattern to lookup Short Harakat(Fatha, Damma, Kasra, sukun, tanwin), 
 248  # but not shadda 
 249  TASHKEEL_PATTERN  = re.compile(ur"["+u"".join(TASHKEEL)+u"]",  re.UNICODE) 
 250  #~ """ Harakat and shadda pattern  """ 
 251  HAMZAT_PATTERN  = re.compile(ur"["+u"".join(HAMZAT)+u"]",  re.UNICODE) 
 252  #~ """ all hamzat pattern""" 
 253  ALEFAT_PATTERN  = re.compile(ur"["+u"".join(ALEFAT)+u"]",  re.UNICODE) 
 254  #~ """ all alef like letters """ 
 255  LIGUATURES_PATTERN  = re.compile(ur"["+u"".join(LIGUATURES)+u"]",  re.UNICODE) 
 256  #~ """ all liguatures pattern """ 
 257  TOKEN_PATTERN =  re.compile(ur"([\w%s]+)" % u"".join(TASHKEEL), re.UNICODE) 
 258  TOKEN_REPLACE = re.compile('\t|\r|\f|\v| ') 
 259  #~ """ pattern to tokenize a text""" 
 260  ################################################ 
 261  #{ is letter functions 
 262  ################################################ 
def is_sukun(archar):
    """Tell whether a character is the Arabic Sukun mark.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar equals SUKUN
    @rtype: Boolean
    """
    return SUKUN == archar
271
def is_shadda(archar):
    """Tell whether a character is the Arabic Shadda mark.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar equals SHADDA
    @rtype: Boolean
    """
    return SHADDA == archar
280
def is_tatweel(archar):
    """Tell whether a character is the Arabic Tatweel letter modifier.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar equals TATWEEL
    @rtype: Boolean
    """
    return TATWEEL == archar
289
def is_tanwin(archar):
    """Tell whether a character is a Tanwin mark
    (FATHATAN, DAMMATAN or KASRATAN).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of TANWIN
    @rtype: Boolean
    """
    found = archar in TANWIN
    return found
298
def is_tashkeel(archar):
    """Tell whether a character is a Tashkeel mark:
        - FATHA, DAMMA, KASRA, SUKUN,
        - SHADDA,
        - FATHATAN, DAMMATAN, KASRATAN.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of TASHKEEL
    @rtype: Boolean
    """
    found = archar in TASHKEEL
    return found
310
def is_haraka(archar):
    """Tell whether a character is a Haraka mark
    (FATHA, DAMMA, KASRA, SUKUN or a tanwin). Shadda is not a haraka.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of HARAKAT
    @rtype: Boolean
    """
    found = archar in HARAKAT
    return found
319
def is_shortharaka(archar):
    """Tell whether a character is a short Haraka mark
    (FATHA, DAMMA, KASRA or SUKUN).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of SHORTHARAKAT
    @rtype: Boolean
    """
    found = archar in SHORTHARAKAT
    return found
328
def is_ligature(archar):
    """Tell whether a character is a Lam-Alef ligature
    (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW,
    LAM_ALEF_MADDA_ABOVE).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of LIGUATURES
    @rtype: Boolean
    """
    found = archar in LIGUATURES
    return found
338
def is_hamza(archar):
    """Tell whether a character is one of the Hamza forms
    (HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW,
    ALEF_HAMZA_BELOW, ALEF_HAMZA_ABOVE).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of HAMZAT
    @rtype: Boolean
    """
    found = archar in HAMZAT
    return found
348
def is_alef(archar):
    """Tell whether a character is one of the Alef forms
    (ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, ALEF_WASLA,
    ALEF_MAKSURA, SMALL_ALEF).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of ALEFAT
    @rtype: Boolean
    """
    found = archar in ALEFAT
    return found
358
def is_yehlike(archar):
    """Tell whether a character is one of the Yeh forms
    (YEH, YEH_HAMZA, ALEF_MAKSURA, SMALL_YEH).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of YEHLIKE
    @rtype: Boolean
    """
    found = archar in YEHLIKE
    return found
368
def is_wawlike(archar):
    """Tell whether a character is one of the Waw forms
    (WAW, WAW_HAMZA, SMALL_WAW).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of WAWLIKE
    @rtype: Boolean
    """
    found = archar in WAWLIKE
    return found
378
def is_teh(archar):
    """Tell whether a character is one of the Teh forms
    (TEH, TEH_MARBUTA).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of TEHLIKE
    @rtype: Boolean
    """
    found = archar in TEHLIKE
    return found
388
def is_small(archar):
    """Tell whether a character is one of the small letters
    (SMALL_ALEF, SMALL_WAW, SMALL_YEH).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of SMALL
    @rtype: Boolean
    """
    found = archar in SMALL
    return found
398
def is_weak(archar):
    """Tell whether a character is one of the weak letters
    (ALEF, WAW, YEH, ALEF_MAKSURA).
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of WEAK
    @rtype: Boolean
    """
    found = archar in WEAK
    return found
408
def is_moon(archar):
    """Tell whether a character is a Moon letter, i.e. listed in MOON.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of MOON
    @rtype: Boolean
    """
    found = archar in MOON
    return found
418
def is_sun(archar):
    """Tell whether a character is a Sun letter, i.e. listed in SUN.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of SUN
    @rtype: Boolean
    """
    found = archar in SUN
    return found
428 ##################################### 429 #{ general letter functions 430 #####################################
def order(archar):
    """Give the alphabetic rank of an Arabic letter, between 1 and 29.
    Alef is 1, Yeh is 28 and the Hamza forms are 29; Teh Marbuta has the
    same rank as Teh (3). Any character missing from ALPHABETIC_ORDER
    yields 0.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: arabic order, or 0 if unknown.
    @rtype: integer
    """
    rank = ALPHABETIC_ORDER.get(archar, 0)
    return rank
441
def name(archar):
    """Give the Arabic name of a letter or diacritic.
    Any character missing from NAMES yields an empty unicode string.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: arabic name, or u'' if unknown.
    @rtype: unicode
    """
    label = NAMES.get(archar, u'')
    return label
451
def arabicrange():
    """Return the list of Arabic characters from U+0600 to U+0652 inclusive.
    @return: list of one-character unicode strings, in code-point order.
    @rtype: list of unicode
    """
    # unichr only exists on Python 2; on Python 3 chr already returns a
    # unicode character.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    # The whole range is inside the Basic Multilingual Plane, so the
    # conversion cannot raise ValueError even on narrow Python 2 builds.
    return [to_char(code) for code in range(0x0600, 0x0653)]
465 ##################################### 466 #{ Has letter functions 467 #####################################
def has_shadda(word):
    """Tell whether an arabic word contains at least one shadda.
    @param word: arabic unicode word
    @type word: unicode
    @return: True if a shadda is found in the word
    @rtype: Boolean
    """
    return re.search(SHADDA, word) is not None
478 ##################################### 479 #{ word and text functions 480 #####################################
def is_vocalized(word):
    """Tell whether an arabic word carries any tashkeel mark.
    The word must not contain spaces or punctuation.
    @param word: arabic unicode word
    @type word: unicode
    @return: True if at least one character of the word is a tashkeel mark
    @rtype: Boolean
    """
    # Shortcut: a purely alphabetic word is reported as unvocalized without
    # scanning it character by character.
    if word.isalpha():
        return False
    return any(is_tashkeel(char) for char in word)
496
def is_vocalizedtext(text):
    """Tell whether an arabic text contains any haraka.
    The text may contain several words and spaces. Only the marks matched
    by HARAKAT_PATTERN are looked for.
    @param text: arabic unicode text
    @type text: unicode
    @return: True if a haraka is found in the text
    @rtype: Boolean
    """
    return bool(re.search(HARAKAT_PATTERN, text))
def is_arabicstring(text):
    """Tell whether a text uses only standard Arabic block characters.
    The text may contain whitespace, digits and punctuation, but every
    other character must lie between U+0600 and U+0652 or be one of the
    accepted Lam-Alef ligatures; extended Arabic is rejected.
    @param text: input text
    @type text: unicode
    @return: True if no character falls outside the accepted set
    @rtype: Boolean
    """
    # Written as a u"..." literal with escaped backslashes: the former
    # ur"..." prefix is a SyntaxError on Python 3.
    # NOTE(review): LAM_ALEF_HAMZA_BELOW is not part of the accepted set,
    # unlike the three other ligatures -- confirm whether that is intended.
    rejected = re.search(
        u"([^\u0600-\u0652%s%s%s\\s\\d])"
        % (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_MADDA_ABOVE),
        text)
    return rejected is None
522
def is_arabicrange(text):
    """Tell whether a text uses only Arabic Unicode block characters.
    Accepted ranges are U+0600-U+06FF, U+0750-U+077F, U+FB50-U+FDFF and
    U+FE70-U+FEFF. Anything else, whitespace included, is rejected.
    @param text: input text
    @type text: unicode
    @return: True if no character falls outside the accepted ranges
    @rtype: Boolean
    """
    outsider = re.search(
        u"([^\u0600-\u06ff\ufb50-\ufdff\ufe70-\ufeff\u0750-\u077f])", text)
    return outsider is None
534
def is_arabicword(word):
    """Tell whether a word looks like a valid Arabic word.
    An Arabic word contains no spaces, digits or punctuation. To catch
    some spelling errors, ALEF_MAKSURA and TEH_MARBUTA are only accepted
    towards the end of the word.
    @param word: input word
    @type word: unicode
    @return: True if the word passes all the checks
    @rtype: Boolean
    """
    # An empty word is never valid.
    if len(word) == 0:
        return False
    # Every character must be in U+0600-U+0652 or an accepted ligature.
    if re.search(u"([^\u0600-\u0652%s%s%s])"
                 % (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_MADDA_ABOVE),
                 word):
        return False
    # A word cannot start with a haraka, nor with hamza on waw or yeh.
    initial = word[0]
    if is_haraka(initial) or initial in (WAW_HAMZA, YEH_HAMZA):
        return False
    # Alef Maksura followed by at least one more character.
    if re.match(u"^(.)*[%s](.)+$"%ALEF_MAKSURA, word):
        return False
    # Teh Marbuta followed by a character other than damma, kasra or fatha
    # and then at least one more character.
    if re.match(u"^(.)*[%s]([^%s%s%s])(.)+$"%
                (TEH_MARBUTA, DAMMA, KASRA, FATHA), word):
        return False
    return True
559 ##################################### 560 #{Char functions 561 #####################################
def first_char(word):
    """Return the first char of a word.
    Like the other char accessors of this module, an empty word yields an
    empty string instead of raising IndexError.
    @param word: given word
    @type word: unicode
    @return: the first char, or an empty string if the word is empty
    @rtype: unicode char
    """
    # A slice, unlike word[0], is safe on an empty word.
    return word[:1]
def second_char(word):
    """Return the second char of a word.
    A word shorter than two chars yields an empty string.
    @param word: given word
    @type word: unicode
    @return: the second char
    @rtype: unicode char
    """
    start, stop = 1, 2
    return word[start:stop]
def last_char(word):
    """Return the last char of a word.
    Example: for "zerrouki", 'i' is the last char. An empty word yields an
    empty string.
    @param word: given word
    @type word: unicode
    @return: the last char
    @rtype: unicode char
    """
    tail = word[-1:]
    return tail
def secondlast_char(word):
    """Return the char just before the last one.
    Example: for "zerrouki", 'k' is the second last char. A word shorter
    than two chars yields an empty string.
    @param word: given word
    @type word: unicode
    @return: the second last char
    @rtype: unicode char
    """
    start, stop = -2, -1
    return word[start:stop]
599 ##################################### 600 #{Strip functions 601 #####################################
def strip_harakat(text):
    """Remove the Harakat from an arabic text, keeping Shadda.
    The removed marks are:
        - FATHA, DAMMA, KASRA
        - SUKUN
        - FATHATAN, DAMMATAN, KASRATAN
    Example:
        text = u"الْعَرَبِيّةُ"
        strip_harakat(text) gives العربيّة
    @param text: arabic text.
    @type text: unicode.
    @return: the text without harakat.
    @rtype: unicode.
    """
    # Nothing to do for empty input or for text without any tashkeel.
    if not text or not is_vocalized(text):
        return text
    for mark in HARAKAT:
        text = text.replace(mark, '')
    return text
def strip_lastharaka(text):
    """Remove the last Haraka of an arabic word, keeping Shadda.
    The marks concerned are:
        - FATHA, DAMMA, KASRA
        - SUKUN
        - FATHATAN, DAMMATAN, KASRATAN
    The removal is done with LASTHARAKA_PATTERN.
    Example:
        text = u"الْعَرَبِيّةُ"
        strip_lastharaka(text) gives الْعَرَبِيّة
    @param text: arabic text.
    @type text: unicode.
    @return: the text without its last haraka.
    @rtype: unicode.
    """
    if text and is_vocalized(text):
        return re.sub(LASTHARAKA_PATTERN, u'', text)
    return text
645
def strip_tashkeel(text):
    """Remove all Tashkeel marks from an arabic text, Shadda included.
    The removed marks are:
        - FATHA, DAMMA, KASRA
        - SUKUN
        - SHADDA
        - FATHATAN, DAMMATAN, KASRATAN
    Example:
        text = u"الْعَرَبِيّةُ"
        strip_tashkeel(text) gives العربية
    @param text: arabic text.
    @type text: unicode.
    @return: the text without tashkeel.
    @rtype: unicode.
    """
    # Nothing to do for empty input or for text without any tashkeel.
    if not text or not is_vocalized(text):
        return text
    for mark in TASHKEEL:
        text = text.replace(mark, '')
    return text
def strip_tatweel(text):
    """Remove every Tatweel from a text.
    Example:
        text = u"العـــــربية"
        strip_tatweel(text) gives العربية
    @param text: arabic text.
    @type text: unicode.
    @return: the text without tatweel.
    @rtype: unicode.
    """
    cleaned = text.replace(TATWEEL, '')
    return cleaned
681
def strip_shadda(text):
    """Remove every Shadda from a text.
    Example:
        text = u"الشّمسيّة"
        strip_shadda(text) gives الشمسية
    @param text: arabic text.
    @type text: unicode.
    @return: the text without shadda.
    @rtype: unicode.
    """
    cleaned = text.replace(SHADDA, '')
    return cleaned
697
def normalize_ligature(text):
    """Replace Lam-Alef ligatures by the two letters LAM and ALEF.
    Some systems present Lam-Alef as a single character; this function
    converts it back into two letters. The ligatures concerned are
    LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW and
    LAM_ALEF_MADDA_ABOVE; each of them becomes plain LAM followed by
    plain ALEF.
    @param text: arabic text.
    @type text: unicode.
    @return: the converted text.
    @rtype: unicode.
    """
    # Empty input is handed back untouched.
    if not text:
        return text
    return LIGUATURES_PATTERN.sub(u'%s%s'%(LAM, ALEF), text)
718
def normalize_hamza(word):
    """Standardize all the Hamza forms into the bare HAMZA.
    ALEF_MADDA is first expanded into two letters, then every character
    matched by HAMZAT_PATTERN is replaced by HAMZA.
    Example:
        text = u"سئل أحد الأئمة"
        normalize_hamza(text) gives سءل ءحد الءءمة
    @param word: arabic text.
    @type word: unicode.
    @return: the converted text.
    @rtype: unicode.
    """
    if word.startswith(ALEF_MADDA):
        remainder = word[1:]
        # A leading Alef Madda becomes HAMZA + ALEF when the word has at
        # least three chars, its second char is not a haraka, and either
        # the third char is a shadda or the word is exactly three chars
        # long. In every other case it becomes HAMZA + HAMZA.
        keep_alef = (len(word) >= 3
                     and word[1] not in HARAKAT
                     and (word[2] == SHADDA or len(word) == 3))
        if keep_alef:
            word = HAMZA + ALEF + remainder
        else:
            word = HAMZA + HAMZA + remainder
    # Any Alef Madda left inside the word is expanded to two hamzas ...
    word = word.replace(ALEF_MADDA, HAMZA+HAMZA)
    # ... and every remaining hamza form collapses into the bare hamza.
    return HAMZAT_PATTERN.sub(HAMZA, word)
743
def separate(word, extract_shadda = False):
    """Separate the letters of an arabic word from its harakat.
    Every letter gets exactly one mark: its haraka, or NOT_DEF_HARAKA when
    it carries none. A shadda is kept among the letters; the letter before
    it receives SUKUN and the shadda itself receives NOT_DEF_HARAKA.
    @param word: the input word
    @type word: unicode
    @param extract_shadda: extract shadda as a separate text
    @type extract_shadda: Boolean
    @return: (letters, marks), or (letters without shadda, marks,
        shadda places) when extract_shadda is true
    @rtype: tuple of unicode
    """
    # NOTE(review): this relies on stack.Stack.pop() returning None once the
    # stack is empty -- confirm against pyarabic.stack.
    stack1 = stack.Stack(word)
    # Reverse the items so that pop() walks the word from its first char.
    stack1.items.reverse()
    letters = stack.Stack()
    marks = stack.Stack()
    vowels = HARAKAT
    last1 = stack1.pop()
    # An arabic word cannot start with a haraka: skip any leading ones.
    while last1 in vowels:
        last1 = stack1.pop()
    while last1 is not None:
        if last1 in vowels:
            # Two harakat cannot stand side by side: the haraka replaces
            # the placeholder pushed for the preceding letter.
            marks.pop()
            marks.push(last1)
        elif last1 == SHADDA:
            # The letter before a shadda takes a sukun as its mark, and
            # the shadda, treated as a letter, takes the undefined mark.
            marks.pop()
            marks.push(SUKUN)
            marks.push(NOT_DEF_HARAKA)
            letters.push(SHADDA)
        else:
            marks.push(NOT_DEF_HARAKA)
            letters.push(last1)
        last1 = stack1.pop()
    if extract_shadda:
        # The shadda is still among the letters at this point.
        wordletters = u''.join(letters.items)
        # Map every non-shadda letter to a tatweel, then merge each
        # tatweel + shadda pair into a single shadda.
        # (u'...' instead of ur'...': the latter is invalid on Python 3.)
        shaddaplaces = re.sub(u'[^%s]'%SHADDA, TATWEEL, wordletters)
        shaddaplaces = re.sub(u'%s%s'%(TATWEEL, SHADDA), SHADDA, shaddaplaces)
        wordletters = strip_shadda(wordletters)
        return (wordletters, u''.join(marks.items), shaddaplaces)
    else:
        return (u''.join(letters.items), u''.join(marks.items))
798 799
def joint(letters, marks):
    """Join the letters of a word with their marks.
    The letters and the marks must have the same length. A mark equal to
    NOT_DEF_HARAKA stands for "no haraka" and is not written out.
    @param letters: the word letters
    @type letters: unicode
    @param marks: the word marks
    @type marks: unicode
    @return: the word; "" if the lengths differ; False if an input is
        left unconsumed
    @rtype: unicode
    """
    # The letters and the marks must pair up one to one.
    if len(letters) != len(marks):
        return ""
    # Reverse the items so that pop() delivers them in word order.
    letter_stack = stack.Stack(letters)
    letter_stack.items.reverse()
    mark_stack = stack.Stack(marks)
    mark_stack.items.reverse()

    result = stack.Stack()
    letter = letter_stack.pop()
    mark = mark_stack.pop()
    while not (letter is None or mark is None):
        if letter == SHADDA:
            # A haraka already written just before the shadda is dropped;
            # anything else is put back.
            previous = result.pop()
            if previous not in HARAKAT:
                result.push(previous)
        result.push(letter)
        if mark != NOT_DEF_HARAKA:
            result.push(mark)
        letter = letter_stack.pop()
        mark = mark_stack.pop()

    if letter_stack.is_empty() and mark_stack.is_empty():
        return ''.join(result.items)
    return False
843 844
def vocalizedlike(word1, word2):
    """Tell whether two words have the same letters and compatible harakat.
    The two words can be fully or partially vocalized. The comparison is
    delegated to vocalized_similarity; any negative score means the words
    differ.
    @param word1: first word
    @type word1: unicode
    @param word2: second word
    @type word2: unicode
    @return: True if the two words have similar vocalization
    @rtype: Boolean
    """
    score = vocalized_similarity(word1, word2)
    return not score < 0
#-------------------------
# Function waznlike(word1, wazn)
#-------------------------
def waznlike(word1, wazn):
    """Tell whether a word follows a wazn (pattern).
    The letters FEH, AIN and LAM of the wazn are generic: each of them
    stands for any letter of the word. Every other letter must be equal
    in both. The two words can be fully or partially vocalized: a haraka
    present on one side only is skipped.
    @param word1: input word
    @type word1: unicode
    @param wazn: given word template
    @type wazn: unicode
    @return: True if the word matches the wazn
    @rtype: Boolean
    """
    # NOTE(review): this relies on stack.Stack.pop() returning None once the
    # stack is empty -- confirm against pyarabic.stack.
    stack1 = stack.Stack(word1)
    stack2 = stack.Stack(wazn)
    last1 = stack1.pop()
    last2 = stack2.pop()
    vowels = HARAKAT
    generic = (FEH, AIN, LAM)
    # Both words are walked from their last char to their first one.
    while last1 is not None and last2 is not None:
        if last1 == last2 and last2 not in generic:
            # Same literal char on both sides.
            last1 = stack1.pop()
            last2 = stack2.pop()
        elif last1 not in vowels and last2 in generic:
            # A generic wazn letter consumes one letter of the word.
            last1 = stack1.pop()
            last2 = stack2.pop()
        elif last1 in vowels and last2 not in vowels:
            # Haraka only in the word: skip it.
            last1 = stack1.pop()
        elif last1 not in vowels and last2 in vowels:
            # Haraka only in the wazn: skip it.
            last2 = stack2.pop()
        else:
            break
    # The word matches only when both sides were consumed entirely. Testing
    # the stacks for emptiness is not enough: on a mismatch the offending
    # chars have already been popped, so a mismatch on the first char (or a
    # word shorter than the wazn) would wrongly be reported as a match.
    return last1 is None and last2 is None
903
def shaddalike(partial, fully):
    """Tell whether the shaddas of a partially vocalized word agree with a
    fully vocalized one.
    The two words must have the same letters. Every shadda of the partial
    word must be at the same place in the full word; the full word may
    carry additional shaddas.
    @param partial: the partially vocalized word
    @type partial: unicode
    @param fully: the fully vocalized word
    @type fully: unicode
    @return: True if the shaddas are compatible
    @rtype: Boolean
    """
    # The input has no shadda: nothing to check.
    if not has_shadda(partial):
        return True
    # The input has a shadda but the candidate has none: mismatch.
    if not has_shadda(fully):
        return False
    # Both have a shadda: compare their places, ignoring the harakat.
    # NOTE(review): this relies on stack.Stack.pop() returning None once the
    # stack is empty -- confirm against pyarabic.stack.
    partial = strip_harakat(partial)
    fully = strip_harakat(fully)
    pstack = stack.Stack(partial)
    vstack = stack.Stack(fully)
    plast = pstack.pop()
    vlast = vstack.pop()
    while plast is not None and vlast is not None:
        if plast == vlast:
            plast = pstack.pop()
            vlast = vstack.pop()
        elif plast != SHADDA and vlast == SHADDA:
            # Extra shadda in the fully vocalized word: skip it.
            vlast = vstack.pop()
        else:
            # A shadda missing from the full word, or different letters.
            break
    # Both words must be consumed entirely. Testing the stacks for
    # emptiness is not enough: on a mismatch the offending chars have
    # already been popped, so a mismatch on the first char would wrongly be
    # reported as a match.
    return plast is None and vlast is None
942
def reduce_tashkeel(text):
    """Reduce the Tashkeel of a text by deleting the evident marks.
    @param text: the input text fully vocalized.
    @type text: unicode.
    @return: partially vocalized text.
    @rtype: unicode.
    """
    # The patterns are applied in this order, each one on the result of the
    # previous one.
    patterns = [
        # delete every sukun and fatha, except those following waw or yeh.
        u"(?<!(%s|%s))(%s|%s)" % (WAW, YEH, SUKUN, FATHA),
        # delete damma if followed by waw.
        u"%s(?=%s)" % (DAMMA, WAW),
        # delete kasra if followed by yeh.
        u"%s(?=%s)" % (KASRA, YEH),
        # delete fatha if followed by alef, to reduce yeh and waw carrying
        # a fatha before alef.
        u"%s(?=%s)" % (FATHA, ALEF),
        # delete fatha from yeh and waw when they begin a word, i.e. when
        # they follow a whitespace.
        # (u"..." with an escaped backslash instead of ur"...": the latter
        # is invalid on Python 3.)
        u"(?<=\\s(%s|%s))%s" % (WAW, YEH, FATHA),
        # delete kasra if preceded by hamza below alef.
        u"(?<=%s)%s" % (ALEF_HAMZA_BELOW, KASRA),
        ]
    reduced = text
    for pat in patterns:
        reduced = re.sub(pat, '', reduced)
    return reduced
969
def vocalized_similarity(word1, word2):
    """Measure how well two words agree on letters and harakat.
    The two words can be fully or partially vocalized: a haraka or a
    shadda present on one side only is skipped without penalty. Every
    other disagreement counts as one error. The comparison runs from the
    end of the words and stops as soon as one of them is exhausted.
    @param word1: first word
    @type word1: unicode
    @param word2: second word
    @type word2: unicode
    @return: True if no error was counted, else the negative number of
        errors
    @rtype: Boolean / int
    """
    first = stack.Stack(word1)
    second = stack.Stack(word2)
    char1 = first.pop()
    char2 = second.pop()
    errors = 0
    while char1 is not None and char2 is not None:
        if char1 == char2:
            char1 = first.pop()
            char2 = second.pop()
        elif char1 in HARAKAT and char2 not in HARAKAT:
            # haraka only in the first word
            char1 = first.pop()
        elif char1 not in HARAKAT and char2 in HARAKAT:
            # haraka only in the second word
            char2 = second.pop()
        elif char1 == SHADDA:
            # shadda only in the first word
            char1 = first.pop()
        elif char2 == SHADDA:
            # shadda only in the second word
            char2 = second.pop()
        else:
            # real disagreement: count it and move on in both words
            char1 = first.pop()
            char2 = second.pop()
            errors += 1
    if errors > 0:
        return -errors
    return True
1009
def tokenize(text = u""):
    """Split a text into tokens.
    The text is split with TOKEN_PATTERN; the separators between words are
    kept as tokens too, once the characters matched by TOKEN_REPLACE
    (space, tab, carriage return, form feed, vertical tab) are removed
    from them. Newlines are not removed.
    @param text: the input text.
    @type text: unicode.
    @return: list of tokens.
    @rtype: list.
    """
    if text == u'':
        return []
    # Split, then clean the non-empty pieces (newline is left alone).
    cleaned = [TOKEN_REPLACE.sub('',piece)
               for piece in TOKEN_PATTERN.split(text) if piece]
    # Drop the pieces that the cleaning left empty.
    return [piece for piece in cleaned if piece]
if __name__ == "__main__":
    # Small manual demo of the vocalization comparison helpers.
    # Each line is printed through a single pre-formatted argument so that
    # it parses and prints the same text on Python 2 and Python 3.
    print("like:  %s" % vocalizedlike(u'مُتَوَهِّمًا', u'متوهمًا'))
    print("sim:  %s" % vocalized_similarity(u'ثمّ', u'ثُمَّ'))
    print("like:  %s" % vocalizedlike(u'ثمّ', u'ثُمَّ'))
    print("sim:  %s" % vocalized_similarity(u'ثم', u'ثُمَّ'))
    print("like:  %s" % vocalizedlike(u'ثم', u'ثُمَّ'))
    print("sim:  %s" % vocalized_similarity(u'مُتَوَهِّمًا', u'متوهمًا'))
    print("sim:  %s" % vocalized_similarity(u'مُتَوَهِّمًا', u'متوهمًا'))