Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

"""Newick format with all features as per the specs at: 

http://evolution.genetics.washington.edu/phylip/newick_doc.html 

http://evolution.genetics.washington.edu/phylip/newicktree.html 

ie: 

Unquoted label underscore munging 

Quoted labels 

Inner node labels 

Lengths 

[ ... ] Comments (discarded) 

Unlabeled tips 

also: 

Double quotes can be used. 

Spaces and quote marks are OK inside unquoted labels. 

""" 

 

#from cogent.parse.record import FileFormatError 

import re 

EOT = None 

 

__author__ = "Peter Maxwell" 

__copyright__ = "Copyright 2007-2011, The Cogent Project" 

__credits__ = ["Peter Maxwell", "Andrew Butterfield", "Gavin Huttley"] 

__license__ = "GPL" 

__version__ = "1.5.1" 

__maintainer__ = "Peter Maxwell" 

__email__ = "pm67nz@gmail.com" 

__status__ = "Production" 

 

class TreeParseError(ValueError):
    """Raised when text cannot be parsed as a Newick-format tree."""
    pass

 

 

class _Tokeniser(object):
    """Supplies an iterable stream of Newick tokens from 'text'

    By default this is very forgiving of non-standard unquoted labels.
    Two options can change how unquoted labels are interpreted:
    To prohibit internal spaces and quotes set strict_labels=True.
    To disable conversion of '_' to ' ' set underscore_unmunge=False.

    NOTE: underscore_unmunging is part of the Newick standard, although it
    is often inconvenient for other purposes.
    """

    def __init__(self, text, strict_labels=False, underscore_unmunge=True):
        self.text = text
        self.posn = None
        # Fix: initialise self.token so that error() can be called safely
        # even before tokens() has started (previously an AttributeError).
        self.token = None
        self.strict_unquoted_labels = strict_labels
        self.underscore_unmunge = underscore_unmunge

    def error(self, detail=""):
        """Return (not raise) a TreeParseError describing the current
        parse position, including a short sample of the offending text.

        The caller is expected to ``raise`` the returned exception.
        """
        if self.token:
            msg = 'Unexpected "%s" at ' % self.token
        else:
            msg = 'At '
        (line, column) = self.posn
        # Show the text leading up to the problem, truncated if long.
        sample = self.text.split('\n')[line][:column]
        if column > 30:
            sample = "..." + sample[-20:]
        if line > 0:
            msg += 'line %s:%s "%s"' % (line+1, column, sample)
        else:
            msg += 'char %s "%s"' % (column, sample)
        return TreeParseError(msg + '. ' + detail)

    def tokens(self):
        """Generate the Newick tokens of self.text.

        Yields label strings (quoting, whitespace-stripping and underscore
        unmunging already applied) and the punctuation characters
        '(' ')' ',' ':' ';'.  End of input is signalled by yielding EOT
        (None).  Text inside [...] comments is discarded.

        Raises TreeParseError (via self.error) for unterminated comments
        and quoted labels.
        """
        closing_quote_token = None   # quote char we are currently inside, if any
        column = 0
        line = 0
        text = None                  # label text accumulated so far
        in_comment = False
        # Split into whitespace runs, doubled quotes (the quote-escape
        # sequence), and the individually significant characters []'"(),:;
        # The capturing group makes re.split keep the delimiters.
        for token in re.split("""([\\t ]+|\\n|''|""|[]['"(),:;])""", self.text)+[EOT]:
            label_complete = False
            token_consumed = True
            self.token = token
            column += len(token or '')
            self.posn = (line, column)

            if token == "":
                pass
            elif in_comment:
                # Discard everything up to the closing ']'
                if token is EOT:
                    raise self.error('Ended with unclosed comment')
                if token == ']':
                    in_comment = False
            elif closing_quote_token:
                # Inside a quoted label.
                if token is EOT:
                    raise self.error('Text ended inside quoted label')
                if token == '\n':
                    raise self.error('Line ended inside quoted label')
                if token == closing_quote_token:
                    label_complete = True
                    closing_quote_token = None
                else:
                    if token == closing_quote_token*2:
                        # A doubled quote escapes a literal quote character.
                        token = token[0]
                    text += token
            elif token is EOT or token in '\n[():,;':
                # Structural punctuation (or newline/comment-start/EOT)
                # terminates any unquoted label accumulated so far.
                if text:
                    text = text.strip()
                    if self.underscore_unmunge and '_' in text:
                        text = text.replace('_', ' ')
                    label_complete = True
                if token == '\n':
                    line += 1
                    column = 1
                elif token == '[':
                    in_comment = True
                else:
                    token_consumed = False
            elif text is not None:
                # Continue an unquoted label; internal spaces/quotes allowed.
                text += token
            elif token in ["''", '""']:
                # An empty quoted label.
                label_complete = True
                text = ""
            elif token in ["'", '"']:
                closing_quote_token = token
                text = ""
            elif token.strip():
                # Start of an unquoted label.
                text = token
                label_complete = self.strict_unquoted_labels

            if label_complete:
                self.token = None
                yield text
                text = None

            if not token_consumed:
                self.token = token
                yield token

 

def parse_string(text, constructor, **kw):
    """Parses a Newick-format string, using specified constructor for tree.

    constructor is called as constructor(children, name, attributes) for
    each node, innermost first; the result for the root node is returned.

    Note: underscore_unmunge, if True, replaces underscores with spaces in
    the data that's read in. This is part of the Newick format, but it is
    often useful to suppress this behavior.
    """
    # Reject a bare word early: otherwise a stray "filename" argument
    # would parse successfully as a tiny one-node tree.
    if "(" not in text and ";" not in text and text.strip():
        raise TreeParseError('Not a Newick tree: "%s"' % text[:10])

    terminators = [';', EOT]   # tokens that may legally end the current level
    enclosing = []             # saved (siblings, terminators, attrs) per open '('
    siblings = []              # completed nodes at the current nesting depth
    children = name = pending_attribute = None
    attributes = {}
    tokeniser = _Tokeniser(text, **kw)
    for token in tokeniser.tokens():
        if pending_attribute is not None:
            # The previous ':' promised a value; convert this token.
            (attr_name, attr_cast) = pending_attribute
            try:
                attributes[attr_name] = attr_cast(token)
            except ValueError:
                raise tokeniser.error("Can't convert %s '%s'" %
                        (attr_name, token))
            pending_attribute = None
        elif token == '(':
            if children is not None:
                raise tokeniser.error(
                        "Two subtrees in one node, missing comma?")
            if name or attributes:
                raise tokeniser.error(
                        "Subtree must be first element of the node.")
            # Descend: remember this level, start collecting the subtree.
            enclosing.append((siblings, terminators, attributes))
            siblings, terminators, attributes = [], [')'], {}
        elif token == ':':
            if 'length' in attributes:
                raise tokeniser.error("Already have a length.")
            pending_attribute = ('length', float)
        elif token in (')', ';', ',', EOT):
            # Any of these finishes the node currently under construction.
            siblings.append(constructor(children, name, attributes))
            children = name = pending_attribute = None
            attributes = {}
            if token in terminators:
                if not enclosing:
                    break
                # Ascend: completed siblings become the parent's children.
                children = siblings
                (siblings, terminators, attributes) = enclosing.pop()
            elif not (token == ',' and ')' in terminators):
                raise tokeniser.error("Was expecting to end with %s" %
                        ' or '.join([repr(s) for s in terminators]))
        else:
            # A label token names the current node.
            if name is not None:
                raise tokeniser.error("Already have a name '%s' for this node." % name)
            if attributes:
                raise tokeniser.error("Name should come before length.")
            name = token
    assert not enclosing, enclosing
    assert len(siblings) == 1, len(siblings)
    return siblings[0]