Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3 pygments.regexopt 

4 ~~~~~~~~~~~~~~~~~ 

5 

6 An algorithm that generates optimized regexes for matching long lists of 

7 literal strings. 

8 

9 :copyright: Copyright 2006-2021 by the Pygments team, see AUTHORS. 

10 :license: BSD, see LICENSE for details. 

11""" 

12 

13import re 

14from re import escape 

15from os.path import commonprefix 

16from itertools import groupby 

17from operator import itemgetter 

18 

19CS_ESCAPE = re.compile(r'[\^\\\-\]]') 

20FIRST_ELEMENT = itemgetter(0) 

21 

22 

23def make_charset(letters): 

24 return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']' 

25 

26 

27def regex_opt_inner(strings, open_paren): 

28 """Return a regex that matches any string in the sorted list of strings.""" 

29 close_paren = open_paren and ')' or '' 

30 # print strings, repr(open_paren) 

31 if not strings: 

32 # print '-> nothing left' 

33 return '' 

34 first = strings[0] 

35 if len(strings) == 1: 

36 # print '-> only 1 string' 

37 return open_paren + escape(first) + close_paren 

38 if not first: 

39 # print '-> first string empty' 

40 return open_paren + regex_opt_inner(strings[1:], '(?:') \ 

41 + '?' + close_paren 

42 if len(first) == 1: 

43 # multiple one-char strings? make a charset 

44 oneletter = [] 

45 rest = [] 

46 for s in strings: 

47 if len(s) == 1: 

48 oneletter.append(s) 

49 else: 

50 rest.append(s) 

51 if len(oneletter) > 1: # do we have more than one oneletter string? 

52 if rest: 

53 # print '-> 1-character + rest' 

54 return open_paren + regex_opt_inner(rest, '') + '|' \ 

55 + make_charset(oneletter) + close_paren 

56 # print '-> only 1-character' 

57 return open_paren + make_charset(oneletter) + close_paren 

58 prefix = commonprefix(strings) 

59 if prefix: 

60 plen = len(prefix) 

61 # we have a prefix for all strings 

62 # print '-> prefix:', prefix 

63 return open_paren + escape(prefix) \ 

64 + regex_opt_inner([s[plen:] for s in strings], '(?:') \ 

65 + close_paren 

66 # is there a suffix? 

67 strings_rev = [s[::-1] for s in strings] 

68 suffix = commonprefix(strings_rev) 

69 if suffix: 

70 slen = len(suffix) 

71 # print '-> suffix:', suffix[::-1] 

72 return open_paren \ 

73 + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \ 

74 + escape(suffix[::-1]) + close_paren 

75 # recurse on common 1-string prefixes 

76 # print '-> last resort' 

77 return open_paren + \ 

78 '|'.join(regex_opt_inner(list(group[1]), '') 

79 for group in groupby(strings, lambda s: s[0] == first[0])) \ 

80 + close_paren 

81 

82 

83def regex_opt(strings, prefix='', suffix=''): 

84 """Return a compiled regex that matches any string in the given list. 

85 

86 The strings to match must be literal strings, not regexes. They will be 

87 regex-escaped. 

88 

89 *prefix* and *suffix* are pre- and appended to the final regex. 

90 """ 

91 strings = sorted(strings) 

92 return prefix + regex_opt_inner(strings, '(') + suffix