Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

# Copyright 2008-2010 by Peter Cock.  All rights reserved. 

# 

# This code is part of the Biopython distribution and governed by its 

# license.  Please see the LICENSE file that should have been included 

# as part of this package. 

"""Bio.AlignIO support for the "nexus" file format. 

 

You are expected to use this module via the Bio.AlignIO functions (or the 

Bio.SeqIO functions if you want to work directly with the gapped sequences). 

 

See also the Bio.Nexus module (which this code calls internally), 

as this offers more than just accessing the alignment or its 

sequences as SeqRecord objects. 

""" 

 

from __future__ import print_function 

 

from Bio.SeqRecord import SeqRecord 

from Bio.Nexus import Nexus 

from Bio.Align import MultipleSeqAlignment 

from .Interfaces import AlignmentWriter 

from Bio import Alphabet 

 

#You can get a couple of example files here: 

#http://www.molecularevolution.org/resources/fileformats/ 

 

 

#This is a generator function! 

def NexusIterator(handle, seq_count=None): 

    """Returns SeqRecord objects from a Nexus file. 

 

    Thus uses the Bio.Nexus module to do the hard work. 

 

    You are expected to call this function via Bio.SeqIO or Bio.AlignIO 

    (and not use it directly). 

 

    NOTE - We only expect ONE alignment matrix per Nexus file, 

    meaning this iterator will only yield one MultipleSeqAlignment. 

    """ 

    n = Nexus.Nexus(handle) 

    if not n.matrix: 

        #No alignment found 

        raise StopIteration 

 

    #Bio.Nexus deals with duplicated names by adding a '.copy' suffix. 

    #The original names and the modified names are kept in these two lists: 

    assert len(n.unaltered_taxlabels) == len(n.taxlabels) 

 

    if seq_count and seq_count != len(n.unaltered_taxlabels): 

        raise ValueError("Found %i sequences, but seq_count=%i" 

               % (len(n.unaltered_taxlabels), seq_count)) 

 

    #ToDo - Can we extract any annotation too? 

    records = (SeqRecord(n.matrix[new_name], id=new_name, 

                         name=old_name, description="") 

               for old_name, new_name 

               in zip(n.unaltered_taxlabels, n.taxlabels)) 

    #All done 

    yield MultipleSeqAlignment(records, n.alphabet) 

 

 

class NexusWriter(AlignmentWriter): 

    """Nexus alignment writer. 

 

    Note that Nexus files are only expected to hold ONE alignment 

    matrix. 

 

    You are expected to call this class via the Bio.AlignIO.write() or 

    Bio.SeqIO.write() functions. 

    """ 

    def write_file(self, alignments): 

        """Use this to write an entire file containing the given alignments. 

 

        alignments - A list or iterator returning MultipleSeqAlignment objects. 

                     This should hold ONE and only one alignment. 

        """ 

        align_iter = iter(alignments)  # Could have been a list 

        try: 

            first_alignment = next(align_iter) 

        except StopIteration: 

            first_alignment = None 

        if first_alignment is None: 

            #Nothing to write! 

            return 0 

 

        #Check there is only one alignment... 

        try: 

            second_alignment = next(align_iter) 

        except StopIteration: 

            second_alignment = None 

        if second_alignment is not None: 

            raise ValueError("We can only write one Alignment to a Nexus file.") 

 

        #Good.  Actually write the single alignment, 

        self.write_alignment(first_alignment) 

        return 1  # we only support writing one alignment! 

 

    def write_alignment(self, alignment): 

        #Creates an empty Nexus object, adds the sequences, 

        #and then gets Nexus to prepare the output. 

        if len(alignment) == 0: 

            raise ValueError("Must have at least one sequence") 

        columns = alignment.get_alignment_length() 

        if columns == 0: 

            raise ValueError("Non-empty sequences are required") 

        minimal_record = "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; " \ 

                         + "format datatype=%s; end;"  \ 

                         % self._classify_alphabet_for_nexus(alignment._alphabet) 

        n = Nexus.Nexus(minimal_record) 

        n.alphabet = alignment._alphabet 

        for record in alignment: 

            n.add_sequence(record.id, str(record.seq)) 

        #For smaller alignments, don't bother to interleave. 

        #For larger alginments, interleave to avoid very long lines 

        #in the output - something MrBayes can't handle. 

        #TODO - Default to always interleaving? 

        n.write_nexus_data(self.handle, interleave=(columns > 1000)) 

 

    def _classify_alphabet_for_nexus(self, alphabet): 

        """Returns 'protein', 'dna', 'rna' based on the alphabet (PRIVATE). 

 

        Raises an exception if this is not possible.""" 

        #Get the base alphabet (underneath any Gapped or StopCodon encoding) 

        a = Alphabet._get_base_alphabet(alphabet) 

 

        if not isinstance(a, Alphabet.Alphabet): 

            raise TypeError("Invalid alphabet") 

        elif isinstance(a, Alphabet.ProteinAlphabet): 

            return "protein" 

        elif isinstance(a, Alphabet.DNAAlphabet): 

            return "dna" 

        elif isinstance(a, Alphabet.RNAAlphabet): 

            return "rna" 

        else: 

            #Must be something like NucleotideAlphabet or 

            #just the generic Alphabet (default for fasta files) 

            raise ValueError("Need a DNA, RNA or Protein alphabet") 

 

if __name__ == "__main__": 

    from Bio._py3k import StringIO 

    print("Quick self test") 

    print("") 

    print("Repeated names without a TAXA block") 

    handle = StringIO("""#NEXUS 

    [TITLE: NoName] 

 

    begin data; 

    dimensions ntax=4 nchar=50; 

    format interleave datatype=protein   gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; 

 

    matrix 

    CYS1_DICDI          -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ----  

    ALEU_HORVU          MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG  

    CATH_HUMAN          ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- 

    CYS1_DICDI          -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X 

    ; 

    end;  

    """) 

    for a in NexusIterator(handle): 

        print(a) 

        for r in a: 

            print("%r %s %s" % (r.seq, r.name, r.id)) 

    print("Done") 

 

    print("") 

    print("Repeated names with a TAXA block") 

    handle = StringIO("""#NEXUS 

    [TITLE: NoName] 

 

    begin taxa 

    CYS1_DICDI 

    ALEU_HORVU 

    CATH_HUMAN 

    CYS1_DICDI; 

    end; 

 

    begin data; 

    dimensions ntax=4 nchar=50; 

    format interleave datatype=protein   gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; 

 

    matrix 

    CYS1_DICDI          -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ----  

    ALEU_HORVU          MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG  

    CATH_HUMAN          ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- 

    CYS1_DICDI          -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X 

    ; 

    end;  

    """) 

    for a in NexusIterator(handle): 

        print(a) 

        for r in a: 

            print("%r %s %s" % (r.seq, r.name, r.id)) 

    print("Done") 

    print("") 

    print("Reading an empty file") 

    assert 0 == len(list(NexusIterator(StringIO()))) 

    print("Done") 

    print("") 

    print("Writing...") 

 

    handle = StringIO() 

    NexusWriter(handle).write_file([a]) 

    handle.seek(0) 

    print(handle.read()) 

 

    handle = StringIO() 

    try: 

        NexusWriter(handle).write_file([a, a]) 

        assert False, "Should have rejected more than one alignment!" 

    except ValueError: 

        pass