Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1######################## BEGIN LICENSE BLOCK ######################## 

2# The Original Code is Mozilla Universal charset detector code. 

3# 

4# The Initial Developer of the Original Code is 

5# Netscape Communications Corporation. 

6# Portions created by the Initial Developer are Copyright (C) 2001 

7# the Initial Developer. All Rights Reserved. 

8# 

9# Contributor(s): 

10# Mark Pilgrim - port to Python 

11# Shy Shalom - original C code 

12# 

13# This library is free software; you can redistribute it and/or 

14# modify it under the terms of the GNU Lesser General Public 

15# License as published by the Free Software Foundation; either 

16# version 2.1 of the License, or (at your option) any later version. 

17# 

18# This library is distributed in the hope that it will be useful, 

19# but WITHOUT ANY WARRANTY; without even the implied warranty of 

20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 

21# Lesser General Public License for more details. 

22# 

23# You should have received a copy of the GNU Lesser General Public 

24# License along with this library; if not, write to the Free Software 

25# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 

26# 02110-1301 USA 

27######################### END LICENSE BLOCK ######################### 

28 

29import logging 

30import re 

31 

32from .enums import ProbingState 

33 

34 

class CharSetProber(object):
    """Base charset prober with neutral defaults and byte-filter helpers.

    The base implementation reports no charset name, ignores fed data and
    returns a confidence of ``0.0``. It also bundles static helpers that
    strip a byte buffer down to the parts relevant for charset analysis.
    """

    # NOTE(review): not referenced inside this class; presumably a
    # confidence cutoff consulted by subclasses or callers -- confirm.
    SHORTCUT_THRESHOLD = 0.95

    # Patterns are compiled once at class creation instead of being looked
    # up on every filter call.
    # One or more consecutive 7-bit (ASCII) bytes.
    _ASCII_RUN_RE = re.compile(b'[\x00-\x7F]+')
    # A word holding at least one high byte, optionally followed by exactly
    # one marker byte (anything that is neither an ASCII letter nor a high
    # byte).
    _INTERNATIONAL_WORD_RE = re.compile(
        b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?')

    def __init__(self, lang_filter=None):
        """Create a prober.

        ``lang_filter`` is stored unchanged on the instance. The probing
        state is ``None`` until ``reset()`` is called.
        """
        self._state = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)

    def reset(self):
        """Put the prober into the ``ProbingState.DETECTING`` state."""
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self):
        """Name of the detected charset; always ``None`` in this base class."""
        return None

    def feed(self, buf):
        """Accept a chunk of input; this base implementation does nothing."""
        pass

    @property
    def state(self):
        """Current probing state as last set by ``__init__`` or ``reset()``."""
        return self._state

    def get_confidence(self):
        """Return the detection confidence; always ``0.0`` in this base class."""
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf):
        r"""
        Returns a copy of ``buf`` in which every run of one or more ASCII
        bytes [\x00-\x7F] is collapsed into a single space, leaving the
        high bytes [\x80-\xFF] untouched.
        """
        return CharSetProber._ASCII_RUN_RE.sub(b' ', buf)

    @staticmethod
    def filter_international_words(buf):
        r"""
        We define three types of bytes:
        alphabet: english alphabets [a-zA-Z]
        international: international characters [\x80-\xFF]
        marker: everything else [^a-zA-Z\x80-\xFF]

        The input buffer can be thought to contain a series of words delimited
        by markers. This function works to filter all words that contain at
        least one international character. All contiguous sequences of markers
        are replaced by a single space ascii character.

        This filter applies to all scripts which do not use English characters.

        Returns a ``bytearray`` with the retained words.
        """
        filtered = bytearray()

        # Each match is a word with at least one international byte; it may
        # carry one trailing marker byte.
        for word in CharSetProber._INTERNATIONAL_WORD_RE.findall(buf):
            # Slices (not indexing) keep these as byte strings on Python 3.
            head = word[:-1]
            tail = word[-1:]

            # If the last byte is a marker, replace it with a space: markers
            # shouldn't affect the analysis (they are used similarly across
            # all languages and may thus have similar frequencies).
            if tail < b'\x80' and not tail.isalpha():
                tail = b' '

            filtered.extend(head)
            filtered.extend(tail)

        return filtered

    @staticmethod
    def filter_with_english_letters(buf):
        """
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        Also retains English alphabet and high byte characters immediately
        before occurrences of >.

        Each retained sequence that was ended by a delimiter is followed by a
        single space. A sequence that runs directly into a ``<`` is dropped,
        because the tag is considered entered before the pending sequence is
        flushed. The result is a ``bytearray``.

        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.
        """
        filtered = bytearray()
        in_tag = False
        # Index of the first byte of the sequence currently being collected.
        prev = 0

        for curr in range(len(buf)):
            # Slice here to get bytes instead of an int with Python 3
            buf_char = buf[curr:curr + 1]

            # Check if we're coming out of or entering an HTML tag. This
            # happens before the flush below, so '>' flushes the pending
            # sequence while '<' suppresses it.
            if buf_char == b'>':
                in_tag = False
            elif buf_char == b'<':
                in_tag = True

            # A byte that is neither extended-ASCII nor alphabetic ends the
            # current sequence.
            if buf_char < b'\x80' and not buf_char.isalpha():
                # Keep the sequence only if it is non-empty and we are not
                # inside a tag.
                if curr > prev and not in_tag:
                    filtered.extend(buf[prev:curr])
                    # Output a space to delimit the stretch we kept
                    filtered.extend(b' ')
                # The next sequence starts right after this delimiter.
                prev = curr + 1

        # Flush the trailing sequence unless the buffer ended inside a tag.
        if not in_tag:
            filtered.extend(buf[prev:])

        return filtered