Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

# Copyright 2004 by Cymon J. Cox and Frank Kauff.  All rights reserved. 

# Copyright 2008 by Michiel de Hoon.  All rights reserved. 

# Revisions copyright 2009 by Cymon J. Cox.  All rights reserved. 

# Revisions copyright 2009 by Peter Cock.  All rights reserved. 

# 

# This code is part of the Biopython distribution and governed by its 

# license.  Please see the LICENSE file that should have been included 

# as part of this package. 

""" 

Parser for PHD files output by PHRED and used by PHRAP and CONSED. 

 

This module can be used directly, which will return Record objects 

that should contain all the original data in the file. 

 

Alternatively, using Bio.SeqIO with the "phd" format will call this module 

internally.  This will give SeqRecord objects for each contig sequence. 

""" 

 

from Bio import Seq 

from Bio.Alphabet import generic_dna 

 

# Comment keywords known for PHD files.  Record pre-populates its
# ``comments`` dictionary with the lower-cased form of each entry.
CKEYWORDS = [
    'CHROMAT_FILE',
    'ABI_THUMBPRINT',
    'PHRED_VERSION',
    'CALL_METHOD',
    'QUALITY_LEVELS',
    'TIME',
    'TRACE_ARRAY_MIN_INDEX',
    'TRACE_ARRAY_MAX_INDEX',
    'TRIM',
    'TRACE_PEAK_AREA_RATIO',
    'CHEM',
    'DYE',
]

 

 

class Record(object):
    """Container for the data of a single PHD file entry.

    A freshly created Record has:

     - file_name    - an empty string
     - comments     - a dictionary holding one key per entry of CKEYWORDS
                      (lower-cased), each initially mapped to None
     - sites        - an empty list
     - seq          - an empty string
     - seq_trimmed  - an empty string
    """

    def __init__(self):
        self.file_name = ''
        # Every known comment keyword starts out as "not seen" (None).
        self.comments = dict.fromkeys(kw.lower() for kw in CKEYWORDS)
        self.sites = []
        self.seq = ''
        self.seq_trimmed = ''

 

 

def read(handle):
    """Read the next PHD record from the handle and return it as a Record.

    The handle is consumed line by line by several successive loops, each
    of which continues from where the previous one stopped.  It therefore
    has to be an iterator that remembers its position (such as an open
    file), not a container that restarts on every iteration.

    Lines before the BEGIN_SEQUENCE line are skipped.  The text following
    "BEGIN_SEQUENCE " becomes the record's file_name.  Recognised comment
    keywords are stored (lower-cased) in record.comments, converted to
    int, float or a (int, int, float) tuple where appropriate; comment
    keywords that are not recognised are ignored.  Each line of the DNA
    section is stored in record.sites as a tuple of its whitespace
    separated fields (kept as strings).

    Returns None if no BEGIN_SEQUENCE line is found.

    Raises ValueError if a BEGIN_COMMENT, END_COMMENT, BEGIN_DNA, END_DNA
    or END_SEQUENCE line is missing, if a comment line has no colon, or
    if a DNA line does not have two or three fields.
    """
    for line in handle:
        if line.startswith("BEGIN_SEQUENCE"):
            record = Record()
            # Everything after "BEGIN_SEQUENCE " is the file name.
            record.file_name = line[15:].rstrip()
            break
    else:
        return None  # No record found

    for line in handle:
        if line.startswith("BEGIN_COMMENT"):
            break
    else:
        raise ValueError("Failed to find BEGIN_COMMENT line")

    for line in handle:
        line = line.strip()
        if not line:
            continue
        if line == "END_COMMENT":
            break
        fields = line.split(":", 1)
        if len(fields) != 2:
            # Without this check the tuple unpacking below would fail with
            # an unhelpful "need more than 1 value to unpack" message.
            raise ValueError("Comment line must be of the form "
                             "'KEYWORD: value', found %r" % line)
        keyword, value = fields
        keyword = keyword.lower()
        value = value.strip()
        if keyword in ('chromat_file',
                       'phred_version',
                       'call_method',
                       'chem',
                       'dye',
                       'time',
                       'basecaller_version',
                       'trace_processor_version'):
            # Free text values are stored unchanged.
            record.comments[keyword] = value
        elif keyword in ('abi_thumbprint',
                         'quality_levels',
                         'trace_array_min_index',
                         'trace_array_max_index'):
            record.comments[keyword] = int(value)
        elif keyword == 'trace_peak_area_ratio':
            record.comments[keyword] = float(value)
        elif keyword == 'trim':
            # Three whitespace separated fields: two integers and a float.
            first, last, prob = value.split()
            record.comments[keyword] = (int(first), int(last), float(prob))
    else:
        raise ValueError("Failed to find END_COMMENT line")

    for line in handle:
        if line.startswith('BEGIN_DNA'):
            break
    else:
        raise ValueError("Failed to find BEGIN_DNA line")

    for line in handle:
        if line.startswith('END_DNA'):
            break
        # Line is: "site quality peak_location"
        # Peak location is optional according to
        # David Gordon (the Consed author)
        parts = line.split()
        if len(parts) in [2, 3]:
            record.sites.append(tuple(parts))
        else:
            raise ValueError("DNA line must contain a base and quality "
                             "score, and optionally a peak location.")
    else:
        # Report an unterminated DNA section as such, in line with the
        # checks made for every other section marker.
        raise ValueError("Failed to find END_DNA line")

    for line in handle:
        if line.startswith("END_SEQUENCE"):
            break
    else:
        raise ValueError("Failed to find END_SEQUENCE line")

    # The sequence is the first field of every DNA line, joined together.
    record.seq = Seq.Seq(''.join(n[0] for n in record.sites), generic_dna)
    if record.comments['trim'] is not None:
        # The first two values of the trim comment are used as slice bounds.
        first, last = record.comments['trim'][:2]
        record.seq_trimmed = record.seq[first:last]

    return record

 

 

def parse(handle):
    """Iterate over a file, yielding one Record object per PHD record.

    The data is read line by line from the handle. The handle can be a list
    of lines, an open file, or similar; the only requirement is that we can
    iterate over the handle to retrieve lines from it.

    Iteration stops when read() finds no further record.  Any ValueError
    raised by read() for a malformed record propagates to the caller.

    Typical usage:

    records = parse(handle)
    for record in records:
        # do something with the record object
    """
    # read() scans the handle with several consecutive loops and is called
    # once per record, so every loop must resume where the previous one
    # stopped.  A container such as a list would restart from its first
    # line each time; taking a single iterator up front avoids that.  For
    # objects that are already iterators this returns the object itself.
    lines = iter(handle)
    while True:
        record = read(lines)
        if record is None:
            return
        yield record