pychemstation.utils.chemstation
File parser for Chemstation files (*.ch) Basically a port of the matlab script at: https://github.com/chemplexity/chromatography/blob/master/Development/File%20Conversion/ImportAgilentFID.m
This file is a standalone file to parse the binary files created by Chemstation
I use it for files with version 130, generated by an Agilent LC.
#!/usr/bin/python
# coding: utf-8

"""File parser for Chemstation files (*.ch).

Basically a port of the matlab script at:
https://github.com/chemplexity/chromatography/blob/master/Development/File%20Conversion/ImportAgilentFID.m

This file is a standalone file to parse the binary files created by
Chemstation. It is used for files with version 130, generated by an
Agilent LC.
"""

import struct
from struct import unpack

import numpy as np

# Constants used for binary file parsing; all values are big-endian
ENDIAN = ">"
STRING = ENDIAN + "{}s"
UINT8 = ENDIAN + "B"
UINT16 = ENDIAN + "H"
INT16 = ENDIAN + "h"
INT32 = ENDIAN + "i"
UINT32 = ENDIAN + "I"


def fread(fid, nelements, dtype):
    """Equivalent to Matlab fread function.

    Args:
        fid: Open binary file object to read from
        nelements (int): Number of elements to read
        dtype: numpy dtype (or ``str`` for 8-bit ASCII characters)

    Returns:
        numpy.ndarray: The read values as a (nelements, 1) column array
    """
    if dtype is str:
        dt = np.uint8  # WARNING: assuming 8-bit ASCII for np.str!
    else:
        dt = dtype

    data_array = np.fromfile(fid, dt, nelements)
    data_array.shape = (nelements, 1)
    return data_array


def parse_utf16_string(file_, encoding="UTF16"):
    """Parse a pascal type UTF16 encoded string from a binary file object.

    The string is stored as one length byte (the number of UTF16 code
    units) followed by twice that many bytes of character data.
    """
    # First read the expected number of CHARACTERS
    string_length = unpack(UINT8, file_.read(1))[0]
    # Then read and decode
    parsed = unpack(STRING.format(2 * string_length), file_.read(2 * string_length))
    return parsed[0].decode(encoding)


class cached_property(object):
    """A property that is only computed once per instance and then replaces
    itself with an ordinary attribute. Deleting the attribute resets the
    property.

    https://github.com/bottlepy/bottle/commit/fa7733e075da0d790d809aa3d2f53071897e6f76
    """

    def __init__(self, func):
        self.__doc__ = getattr(func, "__doc__")
        self.func = func

    def __get__(self, obj, cls):
        if obj is None:
            return self
        # The computed value is stored on the instance, shadowing this
        # descriptor on all subsequent attribute lookups
        value = obj.__dict__[self.func.__name__] = self.func(obj)
        return value


class CHFile(object):
    """Class that implements the Agilent .ch file format version 130.

    Warning: Not all aspects of the file header are understood, so there
    may be (and probably is) information that is not parsed. See
    ``_parse_header_status`` for an overview of which parts of the header
    are understood.

    Attributes:
        values (numpy.array): The intensity values (y-values) of the
            spectrum. The unit for the values is given in
            ``metadata['units']``

        metadata (dict): The extracted metadata

        filepath (str): The filepath this object was loaded from
    """

    # Fields is a table of name, offset and type. Types 'x-time' and 'utf16'
    # are specially handled, the rest are format arguments for struct unpack
    fields = (
        ("sequence_line_or_injection", 252, UINT16),
        ("injection_or_sequence_line", 256, UINT16),
        ("data_offset", 264, UINT32),
        ("start_time", 282, "x-time"),
        ("end_time", 286, "x-time"),
        ("version_string", 326, "utf16"),
        ("description", 347, "utf16"),
        ("sample", 858, "utf16"),
        ("operator", 1880, "utf16"),
        ("date", 2391, "utf16"),
        ("inlet", 2492, "utf16"),
        ("instrument", 2533, "utf16"),
        ("method", 2574, "utf16"),
        ("software version", 3601, "utf16"),
        ("software name", 3089, "utf16"),
        ("software revision", 3802, "utf16"),
        ("zero", 4110, INT32),
        ("units", 4172, "utf16"),
        ("detector", 4213, "utf16"),
        ("yscaling", 4732, ENDIAN + "d"),
    )

    # The start position of the data
    # Get it from metadata['data_offset'] * 512
    data_start = 6144

    # The versions of the file format supported by this implementation
    supported_versions = {130}

    def __init__(self, filepath):
        """Parse header and data from the .ch file at ``filepath``."""
        self.filepath = filepath
        self.metadata = {}
        with open(self.filepath, "rb") as file_:
            self._parse_header(file_)
            self.values = self._parse_data(file_)

    def _parse_header(self, file_):
        """Parse the header.

        Raises:
            ValueError: If the file version is not supported
        """
        # Parse and check version (a pascal string at the start of the file)
        length = unpack(UINT8, file_.read(1))[0]
        parsed = unpack(STRING.format(length), file_.read(length))
        version = int(parsed[0])
        if version not in self.supported_versions:
            raise ValueError("Unsupported file version {}".format(version))
        self.metadata["magic_number_version"] = version

        # Parse all metadata fields
        for name, offset, type_ in self.fields:
            file_.seek(offset)
            if type_ == "utf16":
                self.metadata[name] = parse_utf16_string(file_)
            elif type_ == "x-time":
                # Times are stored in milliseconds; convert to minutes
                self.metadata[name] = unpack(UINT32, file_.read(4))[0] / 60000
            else:
                self.metadata[name] = unpack(type_, file_.read(struct.calcsize(type_)))[
                    0
                ]

    def _parse_header_status(self):
        """Print known and unknown parts of the header"""
        # A context manager guarantees the handle is closed even if
        # parsing raises midway (the original leaked it on error)
        with open(self.filepath, "rb") as file_:
            print("Header parsing status")
            # Map positions to fields for all the known fields
            knowns = {item[1]: item for item in self.fields}
            # A couple of places has a \x01 byte before a string, these we
            # simply skip
            skips = {325, 3600}
            # Jump to after the magic number version
            file_.seek(4)

            # Initialize variables for unknown bytes
            unknown_start = None
            unknown_bytes = b""
            # While we have not yet reached the data
            while file_.tell() < self.data_start:
                current_position = file_.tell()
                # Just continue on skip bytes
                if current_position in skips:
                    file_.read(1)
                    continue

                # If we know about a data field that starts at this point
                if current_position in knowns:
                    # If we have collected unknown bytes, print them out
                    # and reset
                    if unknown_bytes != b"":
                        print(
                            "Unknown at",
                            unknown_start,
                            repr(unknown_bytes.rstrip(b"\x00")),
                        )
                        unknown_bytes = b""
                        unknown_start = None

                    # Print out the position, type, name and value of the
                    # known value
                    print("Known field at {: >4},".format(current_position), end=" ")
                    name, _, type_ = knowns[current_position]
                    if type_ == "x-time":
                        print(
                            'x-time, "{: <19}'.format(name + '"'),
                            unpack(ENDIAN + "f", file_.read(4))[0] / 60000,
                        )
                    elif type_ == "utf16":
                        print(
                            ' utf16, "{: <19}'.format(name + '"'),
                            parse_utf16_string(file_),
                        )
                    else:
                        size = struct.calcsize(type_)
                        print(
                            '{: >6}, "{: <19}'.format(type_, name + '"'),
                            unpack(type_, file_.read(size))[0],
                        )

                # We do not know about a data field at this position. If we
                # have already collected 4 zero bytes, assume that we are
                # done with this unknown field, print and reset
                else:
                    if unknown_bytes[-4:] == b"\x00\x00\x00\x00":
                        print(
                            "Unknown at",
                            unknown_start,
                            repr(unknown_bytes.rstrip(b"\x00")),
                        )
                        unknown_bytes = b""
                        unknown_start = None

                    # Read one byte and save it
                    one_byte = file_.read(1)
                    if unknown_bytes == b"":
                        # Only start a new collection of unknown bytes, if
                        # this byte is not a zero byte
                        if one_byte != b"\x00":
                            unknown_bytes = one_byte
                            unknown_start = file_.tell() - 1
                    else:
                        unknown_bytes += one_byte

    def _parse_data(self, file_):
        """Parse the data.

        Decompress the delta-encoded data, and scale them with y-scaling.
        """
        scaling = self.metadata["yscaling"]

        # Go to the end of the file
        file_.seek(0, 2)
        stop = file_.tell()

        # Go to the start point of the data
        file_.seek(self.data_start)

        signal = []
        buff = [0, 0, 0, 0]

        while file_.tell() < stop:
            # Segment header word; the low 12 bits hold the delta count
            buff[0] = fread(file_, 1, INT16)[0][0]
            buff[1] = buff[3]

            # NOTE(review): in Python `<< 12` never truncates, so this only
            # triggers on a zero header word; the Matlab original relied on
            # int16 truncation here -- confirm the intended mask
            if buff[0] << 12 == 0:
                break

            for _ in range(buff[0] & 4095):
                buff[2] = fread(file_, 1, INT16)[0][0]

                # -32768 flags that the next value is stored as a full
                # 32-bit integer instead of a 16-bit delta
                if buff[2] != -32768:
                    buff[1] = buff[1] + buff[2]
                else:
                    buff[1] = fread(file_, 1, INT32)[0][0]

                signal.append(buff[1])

            buff[3] = buff[1]

        signal = np.array(signal)
        signal = signal * scaling

        return signal

    @cached_property
    def times(self):
        """The time values (x-value) for the data set in minutes"""
        return np.linspace(
            self.metadata["start_time"], self.metadata["end_time"], len(self.values)
        )


if __name__ == "__main__":
    CHFile("lcdiag.reg")
def fread(fid, nelements, dtype):
    """Read ``nelements`` values of ``dtype`` from the open binary file ``fid``.

    Port of the Matlab ``fread`` function: the result is returned as a
    column vector, i.e. a numpy array of shape (nelements, 1).
    """
    # Plain 8-bit reads stand in for Matlab's char type
    numpy_dtype = np.uint8 if dtype is str else dtype

    column = np.fromfile(fid, numpy_dtype, nelements)
    column.shape = (nelements, 1)
    return column
Equivalent to Matlab fread function
def parse_utf16_string(file_, encoding="UTF16"):
    """Read a pascal-style UTF16 string from the binary file object ``file_``.

    The string is stored as a single length byte (the number of UTF16
    code units) followed by twice that many bytes of character data.
    """
    # The leading byte gives the CHARACTER count, not the byte count
    (num_chars,) = unpack(UINT8, file_.read(1))
    raw = file_.read(2 * num_chars)
    (payload,) = unpack(STRING.format(2 * num_chars), raw)
    return payload.decode(encoding)
Parse a pascal type UTF16 encoded string from a binary file object
class cached_property(object):
    """Descriptor that caches a method's result on the instance.

    The wrapped function runs at most once per instance: its return value
    is stored in the instance ``__dict__`` under the same name, which then
    shadows this descriptor on every later lookup. Deleting the attribute
    resets the cache.

    https://github.com/bottlepy/bottle/commit/fa7733e075da0d790d809aa3d2f53071897e6f76
    """

    def __init__(self, func):
        self.func = func
        self.__doc__ = getattr(func, "__doc__")

    def __get__(self, obj, cls):
        # Class-level access returns the descriptor itself
        if obj is None:
            return self
        result = self.func(obj)
        obj.__dict__[self.func.__name__] = result
        return result
A property that is only computed once per instance and then replaces itself with an ordinary attribute. Deleting the attribute resets the property.
https://github.com/bottlepy/bottle/commit/fa7733e075da0d790d809aa3d2f53071897e6f76
class CHFile(object):
    """Class that implements the Agilent .ch file format version 130.

    Warning: Not all aspects of the file header are understood, so there
    may be (and probably is) information that is not parsed. See
    ``_parse_header_status`` for an overview of which parts of the header
    are understood.

    Attributes:
        values (numpy.array): The intensity values (y-values) of the
            spectrum. The unit for the values is given in
            ``metadata['units']``

        metadata (dict): The extracted metadata

        filepath (str): The filepath this object was loaded from
    """

    # Fields is a table of name, offset and type. Types 'x-time' and 'utf16'
    # are specially handled, the rest are format arguments for struct unpack
    fields = (
        ("sequence_line_or_injection", 252, UINT16),
        ("injection_or_sequence_line", 256, UINT16),
        ("data_offset", 264, UINT32),
        ("start_time", 282, "x-time"),
        ("end_time", 286, "x-time"),
        ("version_string", 326, "utf16"),
        ("description", 347, "utf16"),
        ("sample", 858, "utf16"),
        ("operator", 1880, "utf16"),
        ("date", 2391, "utf16"),
        ("inlet", 2492, "utf16"),
        ("instrument", 2533, "utf16"),
        ("method", 2574, "utf16"),
        ("software version", 3601, "utf16"),
        ("software name", 3089, "utf16"),
        ("software revision", 3802, "utf16"),
        ("zero", 4110, INT32),
        ("units", 4172, "utf16"),
        ("detector", 4213, "utf16"),
        ("yscaling", 4732, ENDIAN + "d"),
    )

    # The start position of the data
    # Get it from metadata['data_offset'] * 512
    data_start = 6144

    # The versions of the file format supported by this implementation
    supported_versions = {130}

    def __init__(self, filepath):
        """Parse header and data from the .ch file at ``filepath``."""
        self.filepath = filepath
        self.metadata = {}
        with open(self.filepath, "rb") as file_:
            self._parse_header(file_)
            self.values = self._parse_data(file_)

    def _parse_header(self, file_):
        """Parse the header.

        Raises:
            ValueError: If the file version is not supported
        """
        # Parse and check version (a pascal string at the start of the file)
        length = unpack(UINT8, file_.read(1))[0]
        parsed = unpack(STRING.format(length), file_.read(length))
        version = int(parsed[0])
        if version not in self.supported_versions:
            raise ValueError("Unsupported file version {}".format(version))
        self.metadata["magic_number_version"] = version

        # Parse all metadata fields
        for name, offset, type_ in self.fields:
            file_.seek(offset)
            if type_ == "utf16":
                self.metadata[name] = parse_utf16_string(file_)
            elif type_ == "x-time":
                # Times are stored in milliseconds; convert to minutes
                self.metadata[name] = unpack(UINT32, file_.read(4))[0] / 60000
            else:
                self.metadata[name] = unpack(type_, file_.read(struct.calcsize(type_)))[
                    0
                ]

    def _parse_header_status(self):
        """Print known and unknown parts of the header"""
        # A context manager guarantees the handle is closed even if
        # parsing raises midway (the original leaked it on error)
        with open(self.filepath, "rb") as file_:
            print("Header parsing status")
            # Map positions to fields for all the known fields
            knowns = {item[1]: item for item in self.fields}
            # A couple of places has a \x01 byte before a string, these we
            # simply skip
            skips = {325, 3600}
            # Jump to after the magic number version
            file_.seek(4)

            # Initialize variables for unknown bytes
            unknown_start = None
            unknown_bytes = b""
            # While we have not yet reached the data
            while file_.tell() < self.data_start:
                current_position = file_.tell()
                # Just continue on skip bytes
                if current_position in skips:
                    file_.read(1)
                    continue

                # If we know about a data field that starts at this point
                if current_position in knowns:
                    # If we have collected unknown bytes, print them out
                    # and reset
                    if unknown_bytes != b"":
                        print(
                            "Unknown at",
                            unknown_start,
                            repr(unknown_bytes.rstrip(b"\x00")),
                        )
                        unknown_bytes = b""
                        unknown_start = None

                    # Print out the position, type, name and value of the
                    # known value
                    print("Known field at {: >4},".format(current_position), end=" ")
                    name, _, type_ = knowns[current_position]
                    if type_ == "x-time":
                        print(
                            'x-time, "{: <19}'.format(name + '"'),
                            unpack(ENDIAN + "f", file_.read(4))[0] / 60000,
                        )
                    elif type_ == "utf16":
                        print(
                            ' utf16, "{: <19}'.format(name + '"'),
                            parse_utf16_string(file_),
                        )
                    else:
                        size = struct.calcsize(type_)
                        print(
                            '{: >6}, "{: <19}'.format(type_, name + '"'),
                            unpack(type_, file_.read(size))[0],
                        )

                # We do not know about a data field at this position. If we
                # have already collected 4 zero bytes, assume that we are
                # done with this unknown field, print and reset
                else:
                    if unknown_bytes[-4:] == b"\x00\x00\x00\x00":
                        print(
                            "Unknown at",
                            unknown_start,
                            repr(unknown_bytes.rstrip(b"\x00")),
                        )
                        unknown_bytes = b""
                        unknown_start = None

                    # Read one byte and save it
                    one_byte = file_.read(1)
                    if unknown_bytes == b"":
                        # Only start a new collection of unknown bytes, if
                        # this byte is not a zero byte
                        if one_byte != b"\x00":
                            unknown_bytes = one_byte
                            unknown_start = file_.tell() - 1
                    else:
                        unknown_bytes += one_byte

    def _parse_data(self, file_):
        """Parse the data.

        Decompress the delta-encoded data, and scale them with y-scaling.
        """
        scaling = self.metadata["yscaling"]

        # Go to the end of the file
        file_.seek(0, 2)
        stop = file_.tell()

        # Go to the start point of the data
        file_.seek(self.data_start)

        signal = []
        buff = [0, 0, 0, 0]

        while file_.tell() < stop:
            # Segment header word; the low 12 bits hold the delta count
            buff[0] = fread(file_, 1, INT16)[0][0]
            buff[1] = buff[3]

            # NOTE(review): in Python `<< 12` never truncates, so this only
            # triggers on a zero header word; the Matlab original relied on
            # int16 truncation here -- confirm the intended mask
            if buff[0] << 12 == 0:
                break

            for _ in range(buff[0] & 4095):
                buff[2] = fread(file_, 1, INT16)[0][0]

                # -32768 flags that the next value is stored as a full
                # 32-bit integer instead of a 16-bit delta
                if buff[2] != -32768:
                    buff[1] = buff[1] + buff[2]
                else:
                    buff[1] = fread(file_, 1, INT32)[0][0]

                signal.append(buff[1])

            buff[3] = buff[1]

        signal = np.array(signal)
        signal = signal * scaling

        return signal

    @cached_property
    def times(self):
        """The time values (x-value) for the data set in minutes"""
        return np.linspace(
            self.metadata["start_time"], self.metadata["end_time"], len(self.values)
        )
Class that implements the Agilent .ch file format version
- Warning: Not all aspects of the file header are understood, so there may be — and probably is — information that is not parsed. See _parse_header_status for an overview of which parts of the header are understood.
Attributes:
values (numpy.array): The intensity values (y-values) of the
spectrum. The unit for the values is given in metadata['units']
metadata (dict): The extracted metadata
filepath (str): The filepath this object was loaded from