Package pyarabic ::
Module named
|
|
1
2
3
4 """
5 Arabic named entities recognition (pyarabic.named)
6 """
7 import sys
8
9
10
11
12
13
14
15
16
17 sys.path.append('../../lib')
18 import pyarabic.araby as araby
19 import pyarabic.named_const as named_const
20 import pyarabic.propernouns as propernouns
21
# Fixed tuple of first-name elements.
# NOTE(review): presumably the first halves of compound names ending in
# u'الدين'; the constant is not referenced by the code visible in this
# module -- confirm against callers.
DINENAMED = (
    u'شمس', u'تقي', u'علاء',
    u'نجم', u'نور', u'سيف',
)
def is_proper_noun(word):
    """
    Test whether a word is listed as a proper noun.

    @param word: given word
    @type word: unicode
    @return: True if the word is a key of propernouns.ProperNouns
    @rtype: Boolean
    """
    # dict.has_key() only exists on Python 2; the ``in`` operator performs
    # the same key lookup and works on every Python version.
    return word in propernouns.ProperNouns
43
# One-letter proclitics that may be attached to the first word of a name.
_NAME_PREFIXES = (u'و', u'ف', u'ل', u'ب', u'ك')
# Words that introduce or link the parts of a personal name.
_NAME_PARTICLES = (u'ابن', u'بن', u'أبو', u'أبا', u'أبي', u'عبد', u'عبيد',
                   u'بنو', u'بني', u'بنت')


def _drop_name_prefix(word):
    """Drop one leading proclitic letter unless the word is a name particle."""
    # Particles such as u'بن' or u'بنت' start with a prefix letter themselves;
    # stripping them blindly would turn them into unrecognisable fragments.
    if word and word not in _NAME_PARTICLES and word[0] in _NAME_PREFIXES:
        return word[1:]
    return word


def detect_named_position(wordlist):
    """
    Detect named entities in a word list and return the position of each
    phrase.

    Example (the exact result assumes that none of the remaining words is
    listed in the proper-noun dictionary):
        >>> detect_named_position(
        ...     u"قال خالد بن رافع حدثني أحمد بن عنبر عن خاله".split())
        [(1, 3), (5, 7)]

    @param wordlist: words of the text, vocalized or not
    @type wordlist: unicode list
    @return: inclusive (start, end) word indexes of every detected phrase
    @rtype: list of tuple
    """
    positions = []
    startnamed = -1     # index of the first word of the open phrase, or -1
    endnamed = -1       # index of the last word accepted in the open phrase
    last = len(wordlist) - 1

    for i, word in enumerate(wordlist):
        word_nm = araby.strip_tashkeel(word)
        nextword = araby.strip_tashkeel(wordlist[i + 1]) if i < last else u''

        if i > 0:
            previous = araby.strip_tashkeel(wordlist[i - 1])
            # The previous word may carry a proclitic either when no phrase
            # is open or when it is the very first word of the open phrase
            # (e.g. u'وأبو' followed by the name it introduces).
            if startnamed < 0 or startnamed == i - 1:
                previous = _drop_name_prefix(previous)
        else:
            previous = u''

        # Only the first word of a phrase can have a proclitic.
        key = _drop_name_prefix(word_nm) if startnamed < 0 else word_nm

        if key in _NAME_PARTICLES:
            if startnamed < 0:
                startnamed = i
            endnamed = i
        elif previous in _NAME_PARTICLES:
            if startnamed < 0:
                startnamed = i - 1
            endnamed = i
        elif nextword in (u'بن', u'بنت'):
            if startnamed < 0:
                startnamed = i
            endnamed = i
        elif startnamed < 0 and (
                is_proper_noun(key)
                # A proper noun may itself begin with a prefix letter, so the
                # unstripped form is looked up as well.
                or (key != word_nm and is_proper_noun(word_nm))):
            startnamed = i
            endnamed = i
        else:
            if startnamed >= 0:
                # A trailing word shaped like u'ال...ي' is kept as part of
                # the phrase before it is closed.
                if word_nm.startswith(u'ال') and word_nm.endswith(u'ي'):
                    endnamed = i
                positions.append((startnamed, endnamed))
            startnamed = -1

    # Close a phrase that runs up to the end of the word list.
    if startnamed >= 0:
        positions.append((startnamed, endnamed))
    return positions
117
def extract_named(text):
    """
    Extract the named-entity phrases found in a text.

    The text is tokenized with araby.tokenize, the phrase positions are
    computed by detect_named_position, and the words of every phrase are
    joined with a single space.

    Example:
        >>> extract_named(u"قال خالد بن رافع حدثني أحمد بن عنبر عن خاله")
        [u'خالد بن رافع', u'أحمد بن عنبر']

    @param text: input text
    @type text: unicode
    @return: named-entity phrases extracted from the text
    @rtype: list of unicode
    """
    tokens = araby.tokenize(text)
    limit = len(tokens)
    return [
        u' '.join(tokens[span[0]: span[1] + 1])
        for span in detect_named_position(tokens)
        if len(span) >= 2 and span[0] <= limit and span[1] <= limit
    ]
139
140
def extract_named_within_context(text):
    """
    Extract named-entity phrases together with their neighbouring words.

    Example:
        >>> extract_named_within_context(u"تصدق عبد الله بن عمر بدينار")
        [(u'تصدق', u'عبد الله بن عمر', u'بدينار')]

    @param text: input text
    @type text: unicode
    @return: one (previous word, phrase, next word) tuple per detected
        phrase; a missing neighbour is given as an empty string
    @rtype: list of tuple
    """
    tokens = araby.tokenize(text)
    total = len(tokens)
    contexts = []
    for span in detect_named_position(tokens):
        if len(span) < 2:
            continue
        first, final = span[0], span[1]
        if first > total or final > total:
            continue
        # Neighbouring words, or u'' at either edge of the text.
        before = tokens[first - 1] if first - 1 >= 0 else u''
        after = tokens[final + 1] if final + 1 < total else u''
        contexts.append((before, u' '.join(tokens[first: final + 1]), after))
    return contexts
170
def detect_named(text):
    """
    Detect named entities in a text and return them as phrases.

    Example:
        >>> detect_named(u"وجد عبد الله بن عمر دينارا")
        [u'عبد الله بن عمر']

    @param text: input text
    @type text: unicode
    @return: the detected named-entity phrases, words joined by one space
    @rtype: list of unicode
    """
    words = araby.tokenize(text)
    last = len(words) - 1
    phrase = []      # words of the phrase currently being collected
    phrases = []
    previous = u""   # key of the word handled in the previous iteration
    for i, word in enumerate(words):
        nextword = words[i + 1] if i < last else u""
        key = word
        # Only the first word of a phrase can have a one-letter prefix.
        if not phrase and word and word[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
            key = word[1:]

        if not phrase and key in (u'ابن', ):
            phrase.append(word)
        elif key in (u'بن', u'أبو', u'أبا', u'أبي', u'عبد', ):
            phrase.append(word)
        elif previous in (u'بن', u'ابن', u'أبو', u'أبا', u'أبي', u'عبد',):
            phrase.append(word)
        elif nextword in (u'بن', u'عبد', u'أبو', u'أبي') \
                and word in (u'الدين',):
            phrase.append(previous)
            phrase.append(word)
        elif nextword in (u'بن', ):
            phrase.append(word)
        else:
            if len(phrase) >= 1:
                # A trailing word shaped like u'ال...ي' is kept in the phrase.
                if word.startswith(u'ال') and word.endswith(u'ي'):
                    phrase.append(word)
                phrases.append(u" ".join(phrase))
            phrase = []
        previous = key

    # Flush a phrase that reaches the end of the text.  It is joined with a
    # space like every other phrase (it used to be joined with u"", which
    # glued the words of the last name together).
    if phrase:
        phrases.append(u" ".join(phrase))
    return phrases
225
def get_previous_tag(word):
    """
    Return the tag associated with a word in the named_const word lists.

    Diacritics are stripped first; the lists are then consulted in the
    order NOUN_NASEB_LIST, JAR_LIST, RAFE3_LIST and the first hit wins.

    @param word: given word
    @type word: unicode
    @return: u'منصوب', u'مجرور' or u'مرفوع' according to the matching
        list, or an empty string when the word is in none of them
    @rtype: unicode
    """
    bare = araby.strip_tashkeel(word)

    if bare in named_const.NOUN_NASEB_LIST:
        return u'منصوب'
    if bare in named_const.JAR_LIST:
        return u'مجرور'
    if bare in named_const.RAFE3_LIST:
        return u'مرفوع'
    return u''
243
def vocalize_named(wordlist, syn_tags=u""):
    """
    Vocalize the words of a named-entity phrase.

    Only the word u'بن' is vocalized; every other word is returned as it
    was given.  The first u'بن' takes its final vowel from the tags (the
    given syn_tags plus a tag derived from the first word of the phrase);
    any later u'بن' always gets the kasra form.

    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param syn_tags: tags about the clause
    @type syn_tags: unicode
    @return: the vocalized wordlist
    @rtype: unicode list
    """
    # Tag contributed by the first word of the phrase.
    first_word_tags = {
        u'أبي': u"مجرور",
        u'بنو': u"مجرور",
        u'آل': u"مجرور",
        u'ابن': u"مجرور",
        u'أبو': u"مرفوع",
        u'أبا': u"منصوب",
    }
    # Forms of the first u'بن', tried in this order of precedence.
    first_bin_forms = (
        (u'مجرور', u'بْنِ'),
        (u'مرفوع', u'بْنُ'),
        (u'منصوب', u'بْنَ'),
    )

    tags = syn_tags
    seen_bin = 0
    vocalized = []
    for index, word in enumerate(wordlist):
        bare = araby.strip_tashkeel(word)

        if index == 0 and bare:
            extra_tag = first_word_tags.get(bare)
            if extra_tag:
                tags += extra_tag

        if bare != u'بن':
            vocalized.append(word)
            continue

        seen_bin += 1
        if seen_bin > 1:
            form = u'بْنِ'
        else:
            for tag, candidate in first_bin_forms:
                if tag in tags:
                    form = candidate
                    break
            else:
                # No case information: leave the final letter unvocalized.
                form = u'بْن'
        vocalized.append(form)
    return vocalized
293
def pretashkeel_named(wordlist):
    """
    Vocalize the named-entity phrases found in a word list.

    Each phrase located by detect_named_position is passed to
    vocalize_named together with the tag that get_previous_tag returns for
    the word preceding the phrase (an empty string when the phrase opens
    the list), and the result is put back in place of the phrase.

    @param wordlist: words of the input text
    @type wordlist: unicode list
    @return: wordlist with vocalized named clauses
    @rtype: list
    """
    for span in detect_named_position(wordlist):
        if len(span) < 2:
            continue
        startpos, endpos = span[0], span[1]
        size = len(wordlist)
        if startpos > size or endpos > size:
            continue

        before = wordlist[startpos - 1] if startpos - 1 >= 0 else u''
        replacement = vocalize_named(wordlist[startpos:endpos + 1],
                                     get_previous_tag(before))
        # Build a new list around the vocalized phrase.
        wordlist = wordlist[:startpos] + replacement + wordlist[endpos + 1:]
    return wordlist
324
if __name__ == '__main__':
    # Manual demo: for each sample, print the detected phrase positions and
    # then the text with its named phrases vocalized.
    TEXTS = [
        u"وجد عبد الله بن عمر دينارا",
        u"جاء خالد بن الوليد وقاتل مسيلمة بن حذام الكذاب في موقعة الحديقة",
        u'''روى أحمد بن عقيل الشامي عن أبي طلحة
المغربي أنّ عقابا بن مسعود بن أبي سعاد قال''',
        u"قال مُحَمَّدُ بْنُ خَالِدُ بْنُ إسماعيلفي حديثه",
        u"ِنْصَرَفْنَا إِلَى أَنَسُ بْنُ مَالِكَ الْحَديثِ"
    ]
    for sample in TEXTS:
        # A parenthesised single value prints the same under the Python 2
        # print statement.
        print(detect_named_position(sample.split(' ')))
        unvocalized = araby.strip_tashkeel(sample)
        vocalized_words = pretashkeel_named(araby.tokenize(unvocalized))
        print(u' '.join(vocalized_words).encode('utf8'))
342