Package pyarabic :: Module named
[hide private]
[frames] | no frames]

Source Code for Module pyarabic.named

  1  #!/usr/bin/python 
  2  # -*- coding=utf-8 -*- 
  3  # 
  4  """ 
  5  Arabic named entities recognition: pyarabic.named 
  6  """ 
  7  import sys 
  8  #~if __name__ == '__main__': 
  9      #~import  araby 
 10      #~import  named_const 
 11      #~import  propernouns     
 12  #~else: 
 13      #~sys.path.append('../lib') 
 14      #~import pyarabic.araby as araby 
 15      #~import pyarabic.named_const as named_const 
 16      #~import pyarabic.propernouns as propernouns 
 17  sys.path.append('../../lib') 
 18  import pyarabic.araby as araby 
 19  import pyarabic.named_const as named_const 
 20  import pyarabic.propernouns as propernouns 
 21  # from number import * 
 22  DINENAMED = ( 
 23  u'شمس', 
 24  u'تقي', 
 25  u'علاء', 
 26  u'نجم', 
 27  u'نور', 
 28  u'سيف', 
 29  #u'', 
 30  #u'', 
 31   
 32  ) 
def is_proper_noun(word):
    """
    Test whether a word is listed in the proper-noun table.

    The word is looked up as given in C{propernouns.ProperNouns}; no
    normalisation (tashkeel or prefix stripping) is done here.
    @param word: given word
    @type word: unicode
    @return: True if the word is a key of the proper-noun table
    @rtype: Boolean
    """
    # The ``in`` operator replaces dict.has_key(), which was removed in
    # Python 3; on Python 2 both give the same answer.
    # NOTE(review): assumes ProperNouns is a dict (or supports ``in``).
    return word in propernouns.ProperNouns
43
def detect_named_position(wordlist):
    """
    Detect named-entity phrases in a word list and return their positions.

    Every word is compared without its tashkeel (araby.strip_tashkeel).
    A phrase is opened or extended when the word is a name particle
    (بن, ابن, أبو, ...), when the word before it is one, or when the word
    after it is بن or بنت. A phrase can also be opened (never extended)
    by a word accepted by is_proper_noun(). While no phrase is open, one
    leading letter among و ف ل ب ك is dropped from the current word and
    from the previous word before these comparisons.
    A word matching none of the rules closes the open phrase; it is still
    included in the phrase when it starts with ال and ends with ي
    (family name).

    Example:
        >>> words = u"قال خالد بن رافع حدثني أحمد بن عنبر عن خاله".split()
        >>> detect_named_position(words)
        [(1, 3), (5, 7)]
    (result when neither قال nor خاله is in the proper-noun table)

    @param wordlist: the words of the text, in order
    @type wordlist: unicode list
    @return: inclusive (start, end) word indexes of each detected phrase
    @rtype: list of tuple
    """
    # one-letter prefixes that may be attached to the first word of a name
    prefixes = (u'و', u'ف', u'ل', u'ب', u'ك')
    # words that belong to a name and announce its next element
    particles = (u'ابن', u'بن', u'أبو', u'أبا', u'أبي',
                 u'عبد', u'عبيد', u'بنو', u'بني', u'بنت')
    # Strip the diacritics once per word; each word is needed as the
    # current, the previous and the next word of some iteration.
    stripped = [araby.strip_tashkeel(word) for word in wordlist]
    last = len(stripped) - 1

    positions = []
    startnamed = -1     # index of the first word of the open phrase, or -1
    endnamed = -1       # index of its last word; meaningful only when open
    for i, word_nm in enumerate(stripped):
        nextword = stripped[i + 1] if i < last else u''
        previous = stripped[i - 1] if i > 0 else u''
        # prefixes are only ignored while looking for the start of a phrase
        if previous and startnamed < 0 and previous[0] in prefixes:
            previous = previous[1:]
        key = word_nm
        # NOTE(review): because ب is one of the prefixes, a bare بن / بنو /
        # بني / بنت met while no phrase is open is reduced here and no
        # longer matches the particle list below -- confirm this is wanted.
        if word_nm and startnamed < 0 and word_nm[0] in prefixes:
            key = word_nm[1:]

        if key in particles:
            # also covers a leading ابن, which the original tested separately
            # with the same effect
            if startnamed < 0:
                startnamed = i
            endnamed = i
        elif previous in particles:
            # the word following a particle is part of the name; when no
            # phrase is open the particle itself becomes the start
            if startnamed < 0:
                startnamed = i - 1
            endnamed = i
        elif nextword in (u'بن', u'بنت'):
            if startnamed < 0:
                startnamed = i
            endnamed = i
        elif startnamed < 0 and is_proper_noun(key):
            # a proper noun can open a phrase
            startnamed = i
            endnamed = i
        elif startnamed >= 0:
            # the word does not continue the name: close the open phrase
            if word_nm.startswith(u'ال') and word_nm.endswith(u'ي'):
                # add the family name to the phrase
                endnamed = i
            positions.append((startnamed, endnamed))
            startnamed = -1
    # a phrase still open at the end of the text
    if startnamed >= 0:
        positions.append((startnamed, endnamed))
    return positions
117
def extract_named(text):
    """
    Extract the named-entity phrases found in a text.

    The text is split with araby.tokenize(), the phrases are located with
    detect_named_position(), and the words of each phrase are joined with
    a single space.

    Example:
        >>> extract_named(u"قال خالد بن رافع حدثني أحمد بن عنبر عن خاله")
    is expected to give the phrases "خالد بن رافع" and "أحمد بن عنبر".
    @param text: input text
    @type text: unicode
    @return: the named-entity phrases, in order of appearance
    @rtype: list of unicode
    """
    tokens = araby.tokenize(text)
    found = []
    for span in detect_named_position(tokens):
        if len(span) < 2:
            continue
        first, final = span[0], span[1]
        # sanity check on the reported indexes (kept as in the original,
        # i.e. an index equal to the length is tolerated)
        if first <= len(tokens) and final <= len(tokens):
            found.append(u' '.join(tokens[first: final + 1]))
    return found
139 140
def extract_named_within_context(text):
    """
    Extract named-entity phrases together with their neighbouring words.

    Example:
        >>> extract_named_within_context(u"تصدق عبد الله بن عمر بدينار")
    is expected to give one triple: ("تصدق", "عبد الله بن عمر", "بدينار").

    @param text: input text
    @type text: unicode
    @return: one (previous word, phrase, next word) triple per detected
        phrase; a missing neighbour is an empty string
    @rtype: list of tuple
    """
    tokens = araby.tokenize(text)
    total = len(tokens)
    triples = []
    for span in detect_named_position(tokens):
        if len(span) < 2:
            continue
        first, final = span[0], span[1]
        # same lenient index check as the original (<= rather than <)
        if first > total or final > total:
            continue
        before = tokens[first - 1] if first >= 1 else u''
        after = tokens[final + 1] if final + 1 < total else u''
        triples.append((before, u' '.join(tokens[first: final + 1]), after))
    return triples
170
def detect_named(text):
    """
    Detect named entities in a text and return them as strings.

    The text is split with araby.tokenize() and scanned word by word;
    unlike detect_named_position(), the words are compared as they are
    (tashkeel is not stripped) and the proper-noun table is not used.

    Example:
        >>> detect_named(u"وجد عبد الله بن عمر دينارا")
    is expected to give one phrase: "عبد الله بن عمر".
    @param text: input text
    @type text: unicode
    @return: the detected phrases, words separated by one space
    @rtype: list of unicode
    """
    words = araby.tokenize(text)
    last = len(words) - 1
    phrase = []         # words of the phrase being built
    phrases = []
    previous = u""      # key of the previous word
    for i, word in enumerate(words):
        nextword = words[i + 1] if i < last else u""
        key = word
        # the first word of a phrase can carry a one-letter prefix
        if not phrase and word and word[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
            key = word[1:]

        if not phrase and key in (u'ابن', ):
            phrase.append(word)
        elif key in (u'بن', u'أبو', u'أبا', u'أبي', u'عبد', ):
            phrase.append(word)
        elif previous in (u'بن', u'ابن', u'أبو', u'أبا', u'أبي', u'عبد',):
            phrase.append(word)
        elif nextword in (u'بن', u'عبد', u'أبو', u'أبي') \
                and word in (u'الدين',):
            # "<word> الدين": keep the word that precedes الدين as well
            phrase.append(previous)
            phrase.append(word)
        elif nextword in (u'بن', ):
            phrase.append(word)
        else:
            if phrase:
                # a closing word of the form ال...ي is kept in the phrase
                if word.startswith(u'ال') and word.endswith(u'ي'):
                    phrase.append(word)
                phrases.append(u" ".join(phrase))
            phrase = []
        previous = key
    # A phrase still open at the end of the text. It is joined with a
    # space like every other phrase; the original used u"" here, which
    # glued the words of a text-final name together.
    if phrase:
        phrases.append(u" ".join(phrase))
    return phrases
225
def get_previous_tag(word):
    """
    Give the case tag that a word imposes on what follows it.

    The word is stripped of its tashkeel and looked up, in this order, in
    named_const.NOUN_NASEB_LIST, named_const.JAR_LIST and
    named_const.RAFE3_LIST; the first list containing it decides the tag.
    @param word: given word
    @type word: unicode
    @return: u'منصوب', u'مجرور' or u'مرفوع', or an empty string when the
        word is in none of the lists
    @rtype: unicode
    """
    bare = araby.strip_tashkeel(word)
    tag = u''
    if bare in named_const.NOUN_NASEB_LIST:
        tag = u'منصوب'
    elif bare in named_const.JAR_LIST:
        tag = u'مجرور'
    elif bare in named_const.RAFE3_LIST:
        tag = u'مرفوع'
    return tag
243
def vocalize_named(wordlist, syn_tags=""):
    """
    Vocalize the words of a named-entity phrase.

    Only the word بن is vocalized at present; every other word is
    returned unchanged. The first بن of the phrase takes its final vowel
    from the tags (مجرور, then مرفوع, then منصوب are tried in that order,
    and it is left without a final vowel when none applies); every later
    بن is vocalized as مجرور.
    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param syn_tags: tags about the clause; the first word of the phrase
        may add a tag of its own
    @type syn_tags: unicode
    @return: the vocalized words, same length as wordlist
    @rtype: list of unicode
    """
    # tag contributed by the first word of the phrase
    # NOTE(review): بنو and ابن are listed with the مجرور triggers, as in
    # the original -- confirm this is intended.
    first_word_tag = {
        u'أبي': u"مجرور",
        u'بنو': u"مجرور",
        u'آل': u"مجرور",
        u'ابن': u"مجرور",
        u'أبو': u"مرفوع",
        u'أبا': u"منصوب",
    }
    # vocalized form of the first بن for each tag, in order of precedence
    first_bin_forms = (
        (u'مجرور', u'بْنِ'),
        (u'مرفوع', u'بْنُ'),
        (u'منصوب', u'بْنَ'),
    )
    tags = syn_tags
    seen_bin = 0
    vocalized = []
    for index, word in enumerate(wordlist):
        # compare without harakat, but keep the original word for output
        bare = araby.strip_tashkeel(word)
        if index == 0 and bare in first_word_tag:
            tags += first_word_tag[bare]

        if bare != u'بن':
            # Todo: vocalize the names themselves
            vocalized.append(word)
            continue

        seen_bin += 1
        if seen_bin > 1:
            vocalized.append(u'بْنِ')
            continue
        for tag, form in first_bin_forms:
            if tag in tags:
                vocalized.append(form)
                break
        else:
            vocalized.append(u'بْن')
    return vocalized
293
def pretashkeel_named(wordlist):
    """
    Vocalize the named-entity phrases of a word list.

    Phrases are located with detect_named_position(); each one is passed
    to vocalize_named() together with the tag that get_previous_tag()
    gives for the word standing just before the phrase (an empty string
    is used when the phrase opens the text).

    Example:
        >>> pretashkeel_named(araby.tokenize(u"وجد عبد الله بن عمر دينارا"))
    returns the same words with the detected phrase replaced by its
    vocalized form.
    @param wordlist: the words of the text
    @type wordlist: unicode list
    @return: the word list with vocalized named-entity phrases; the list
        passed in is not modified in place
    @rtype: list
    """
    for span in detect_named_position(wordlist):
        if len(span) < 2:
            continue
        startpos, endpos = span[0], span[1]
        # same lenient index check as the original (<= rather than <)
        if startpos > len(wordlist) or endpos > len(wordlist):
            continue
        # the word before the phrase determines its case tag
        before = wordlist[startpos - 1] if startpos >= 1 else u''
        case_tag = get_previous_tag(before)
        voweled = vocalize_named(wordlist[startpos:endpos + 1], case_tag)
        # rebuild the list around the vocalized phrase
        wordlist = wordlist[:startpos] + voweled + wordlist[endpos + 1:]
    return wordlist
if __name__ == '__main__':
    # Demo: for each sample print the detected phrase positions, then the
    # text with its named-entity phrases vocalized.
    TEXTS = [
        u"وجد عبد الله بن عمر دينارا",
        u"جاء خالد بن الوليد وقاتل مسيلمة بن حذام الكذاب في موقعة الحديقة",
        u'''روى أحمد بن عقيل الشامي عن أبي طلحة
 المغربي أنّ عقابا بن مسعود بن أبي سعاد قال''',
        u"قال مُحَمَّدُ بْنُ خَالِدُ بْنُ إسماعيلفي حديثه",
        u"ِنْصَرَفْنَا إِلَى أَنَسُ بْنُ مَالِكَ الْحَديثِ",
    ]
    for sample in TEXTS:
        # positions are computed on the raw text split on single spaces
        spans = detect_named_position(sample.split(' '))
        print(spans)
        # vocalization starts from the text without any tashkeel
        bare_text = araby.strip_tashkeel(sample)
        voweled = pretashkeel_named(araby.tokenize(bare_text))
        print(u' '.join(voweled).encode('utf8'))