Package tfasta :: Module _tfasta
[hide private]
[frames] | no frames]

Source Code for Module tfasta._tfasta

  1  #! /usr/bin/env python 
  2   
  3  """ 
  4  tfasta: reading and writing of fasta files 
  5   
  6  This is automatically generated documentation and should 
  7  not be relied on for the API. Please 
  8  see the official documentation at http://pythonhosted.org/tfasta/. 
  9   
 10  The most useful functions are: 
 11   
 12       - B{L{fasta_parser}}: returns an iterator for a fasta file 
 13       - B{L{string_fasta_parser}}: returns an iterator for fasta text 
 14       - B{L{io_fasta_parser}}: returns an iterator for an open C{file}-like object containing fasta text 
 15       - B{L{make_fasta_from_dict}}: returns a string representation 
 16               of a fasta file given a C{dict} of sequences 
 17               keyed by record name 
 18       - B{L{make_fasta}}: returns a string representation of a 
 19               fasta record given a sequence (as a C{str}) and 
 20               a name (also as a C{str}) 
 21   
 22  See L{tfasta_templates} documentation for supported fasta file types. 
 23   
 24  @var FASTA_WIDTH: default width of fasta sequences 
 25  @type FASTA_WIDTH: int 
 26  """ 
 27   
 28  import re 
 29  import sys 
 30  import cStringIO 
 31  from tfasta_templates import FastaTemplate, TEMPLATES 
 32   
 33  T_DEF = TEMPLATES['default']  # template io_fasta_parser() falls back to when template is None
 34  T_SWISS = TEMPLATES['swissprot']  # the remaining T_* names are aliases for other TEMPLATES entries
 35  T_PDB = TEMPLATES['pdb'] 
 36  T_NR = TEMPLATES['nr'] 
 37  T_NRBLAST = TEMPLATES['nrblast'] 
 38   
 39  FASTA_WIDTH = 60  # default number of sequence characters per line written by make_fasta()
 40   
 41  CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-"  # characters make_fasta() keeps; all others are dropped
 42   
 43  ####################################################################### 
 44  # io_fasta_parser() 
 45  ####################################################################### 
def io_fasta_parser(fastafile, template=None, dogaps=False):
    """
    Helper generator function for L{fasta_parser} and
    L{string_fasta_parser}.

    Given I{fastafile} (C{file}-like object, open for reading),
    returns an iterator that iterates over the fasta file.
    It will C{yield} one dictionary per record, keyed according
    to the C{fields} in C{template}. Every dictionary also
    includes the record's sequence keyed by "sequence".

    Lines that appear before the first header line are ignored.
    Sequence lines are upper-cased and stripped of every character
    that is not a letter A-Z (or "-" when I{dogaps} is true).

    I{fastafile} is closed when iteration finishes, when the generator
    is closed before it is exhausted, or when parsing raises.

    @param fastafile: C{file}-like object containing fasta text,
                      opened for reading
    @param template: instance of C{FastaTemplate} class--choose from
                     TEMPLATES or define your own. Defaults to C{T_DEF}.
    @type template: FastaTemplate
    @param dogaps: a C{bool} specifying whether to keep "-" in the
                   sequence after parsing the file
                     - if C{False}, then gaps are ignored
                     - handy if processing an alignment
    @type dogaps: bool

    @return: generator yielding one C{dict} per fasta record
    @rtype: generator
    """
    # set the default template if necessary
    if template is None:
        template = T_DEF

    # a set gives constant-time membership tests in the per-character filter
    alphabet = set("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    if dogaps:
        alphabet.add("-")

    entry = None   # record being assembled; None until the first header
    chunks = []    # cleaned sequence lines of the current record

    try:
        while True:
            aline = fastafile.readline()
            if not aline:
                # end of file
                break
            fasta_match = template.match(aline)
            if fasta_match:
                # a new header terminates the record collected so far
                if entry is not None:
                    entry["sequence"] = "".join(chunks)
                    yield entry
                entry = {"sequence": ""}
                chunks = []
                # match group 0 is the entire match, so fields start at 1
                for index in range(len(template.fields)):
                    entry[template.get_field(index)] = \
                        fasta_match.group(index + 1)
            elif entry is not None:
                # inside a record, so this must be a sequence line
                chunks.append("".join(
                    [c.upper() for c in aline if c.upper() in alphabet]))
            # otherwise: text before the first record, keep looking
        if entry is not None:
            entry["sequence"] = "".join(chunks)
            yield entry
    finally:
        # runs on exhaustion, on generator close and on errors alike
        fastafile.close()
115 116 117 ####################################################################### 118 # fasta_parser() 119 #######################################################################
def fasta_parser(filename, template=None, greedy=None, dogaps=False):
    """
    Given a I{filename}, returns an iterator that iterates
    over the fasta file. It will C{yield} dictionaries keyed according
    to the C{fields} in C{template}. These dictionaries will all also
    include a sequence keyed by "sequence". Yielding dictionaries
    allows for flexibility in the types of fasta files parsed.

    File format testing is not done, so make sure it's a fasta file.

    @param filename: name of the fasta file
    @type filename: str
    @param template: instance of C{FastaTemplate} class--choose from
                     TEMPLATES or define your own.
    @type template: FastaTemplate
    @param greedy: a C{bool} specifying whether to read the
                   whole fasta file in at once. Set to C{True} for many
                   smaller files or to C{False} for a few or one REALLY
                   big ones. C{None} (the default) means C{False}.
    @type greedy: bool
    @param dogaps: a C{bool} specifying whether to keep "-" in the
                   sequence after parsing the file
                     - if C{False}, then gaps are ignored
                     - handy if processing an alignment
    @type dogaps: bool

    @return: the iterator returned by L{io_fasta_parser}
    """
    if not greedy:
        # lazy mode: hand the open file over; io_fasta_parser closes it
        # once the file has been read to the end
        return io_fasta_parser(open(filename), template, dogaps)

    # cStringIO only exists on Python 2; io.StringIO is the Python 3 equivalent
    try:
        from cStringIO import StringIO
    except ImportError:
        from io import StringIO

    # read the whole file and make a switch, no-one the wiser; the
    # context manager closes the real file even if read() fails
    with open(filename) as fastafile:
        contents = fastafile.read()
    return io_fasta_parser(StringIO(contents), template, dogaps)
156 157 ####################################################################### 158 # string_fasta_parser() 159 #######################################################################
def string_fasta_parser(astr, template=None, dogaps=False):
    """
    Given I{astr} (string of fasta), returns an iterator that iterates
    over the fasta text. It will C{yield} dictionaries keyed according
    to the C{fields} in C{template}. These dictionaries will all also
    include a sequence keyed by "sequence". Yielding dictionaries
    allows for flexibility in the types of fasta files parsed.

    Before parsing, every line is stripped of leading and trailing
    whitespace and lines that are empty after stripping are dropped.

    Beyond simple extra whitespace, the I{astr} should be properly
    formatted fasta text.

    @param astr: fasta text
    @type astr: str
    @param template: instance of C{FastaTemplate} class--choose from
                     TEMPLATES or define your own.
    @type template: FastaTemplate
    @param dogaps: a C{bool} specifying whether to keep "-" in the
                   sequence after parsing the file
                     - if C{False}, then gaps are ignored
                     - handy if processing an alignment
    @type dogaps: bool

    @return: the iterator returned by L{io_fasta_parser}
    """
    # cStringIO only exists on Python 2; io.StringIO is the Python 3 equivalent
    try:
        from cStringIO import StringIO
    except ImportError:
        from io import StringIO

    # strip each line once, then discard the ones left empty
    stripped = [line.strip() for line in astr.splitlines()]
    cleaned = "\n".join([line for line in stripped if line])
    return io_fasta_parser(StringIO(cleaned), template, dogaps)
187 188 ################################################################### 189 # make_fasta_from_dict() 190 ###################################################################
def make_fasta_from_dict(adict, width=FASTA_WIDTH):
    """
    Give it a C{dict} of sequences keyed by name of the sequence
    and it returns a fasta representation as a C{str}.

    Records are written in sorted order of their names, each one
    formatted by L{make_fasta}.

    @param adict: C{dict} of sequences keyed by name
    @type adict: dict
    @param width: number of sequence characters per line, passed
                  through to L{make_fasta}
    @type width: int

    @return: fasta representation of sequences as a C{str}; every
             record (including the first) is preceded by a newline,
             and an empty I{adict} gives an empty string
    @rtype: str
    """
    # sorted() works on both Python 2 lists and Python 3 key views
    records = [make_fasta(aname, adict[aname], width)
               for aname in sorted(adict)]
    # one join instead of re-joining the growing string for every record;
    # the leading newline before each record is the established format
    return "".join(["\n" + record for record in records])
209 210 211 ################################################################### 212 # make_fasta() 213 ###################################################################
def make_fasta(name, seq, width=FASTA_WIDTH):
    """
    Give it a sequence I{name} and a sequence (I{seq}) and it
    returns a fasta representation as a C{str}.

    Characters of I{seq} that are not in C{CHARS} are dropped and the
    rest is upper-cased. The result is a ">" header line followed by
    the sequence wrapped at I{width} characters per line, with no
    trailing newline. An empty sequence gives just the header line.

    @param name: name of sequence
    @type name: str
    @param seq: sequence as a C{str}
    @type seq: str
    @param width: number of sequence characters per line
    @type width: int

    @raise ValueError: if I{width} is zero

    @return: a string representation of a fasta record
    @rtype: str
    """
    if width == 0:
        raise ValueError("width must be a non-zero integer")
    # the sign of width is ignored, kept for backward compatibility
    step = abs(width)

    residues = "".join([c for c in seq if c in CHARS]).upper()
    lines = [">" + str(name)]
    # slice whole lines instead of appending one character at a time
    lines.extend([residues[start:start + step]
                  for start in range(0, len(residues), step)])
    return "\n".join(lines)
239 240 241 242 ####################################################################### 243 # test_parser 244 #######################################################################
def test_parser(template, filename):
    """
    Tests for proper construction of a parser using I{template} or
    of the fasta file named I{filename}.

    Every line of the file that contains a ">" is checked against
    I{template}; the first such line that I{template} does not match
    aborts the test.

    @param template: C{FastaTemplate} describing fasta record
    @type template: FastaTemplate
    @param filename: name of fasta file
    @type filename: str

    @raise Exception: raises C{Exception} if fasta file is malformed
                      or if the I{template} didn't work

    @return: C{True} if it works.
    @rtype: bool
    """
    # the context manager closes the file on every exit path
    with open(filename) as afile:
        for aline in afile:
            if ">" in aline and not template.match(aline):
                # call syntax is valid on both Python 2 and Python 3
                raise Exception("%s\n\nmalformed fasta file: '%s'" %
                                (aline, filename))
    return True
272