pychemstation.utils.chemstation

File parser for Chemstation files (*.ch) Basically a port of the matlab script at: https://github.com/chemplexity/chromatography/blob/master/Development/File%20Conversion/ImportAgilentFID.m

This file is a standalone file to parse the binary files created by Chemstation

I use it for files with version 130, generated by an Agilent LC.

  1#!/usr/bin/python
  2# coding: utf-8
  3
  4"""
  5File parser for Chemstation files (*.ch)
  6Basically a port of the matlab script at:
  7https://github.com/chemplexity/chromatography/blob/master/Development/File%20Conversion/ImportAgilentFID.m
  8
  9This file is a standalone file to parse the binary files created by Chemstation
 10
 11I use it for file with version 130, genereted by an Agilent LC.
 12"""
 13
 14import struct
 15from struct import unpack
 16import numpy as np
 17
 18# Constants used for binary file parsing
 19ENDIAN = ">"
 20STRING = ENDIAN + "{}s"
 21UINT8 = ENDIAN + "B"
 22UINT16 = ENDIAN + "H"
 23INT16 = ENDIAN + "h"
 24INT32 = ENDIAN + "i"
 25UINT32 = ENDIAN + "I"
 26
 27
 28def fread(fid, nelements, dtype):
 29
 30    """Equivalent to Matlab fread function"""
 31
 32    if dtype is str:
 33        dt = np.uint8  # WARNING: assuming 8-bit ASCII for np.str!
 34    else:
 35        dt = dtype
 36
 37    data_array = np.fromfile(fid, dt, nelements)
 38    data_array.shape = (nelements, 1)
 39
 40    return data_array
 41
 42
 43def parse_utf16_string(file_, encoding="UTF16"):
 44
 45    """Parse a pascal type UTF16 encoded string from a binary file object"""
 46
 47    # First read the expected number of CHARACTERS
 48    string_length = unpack(UINT8, file_.read(1))[0]
 49    # Then read and decode
 50    parsed = unpack(STRING.format(2 * string_length), file_.read(2 * string_length))
 51    return parsed[0].decode(encoding)
 52
 53
 54class cached_property(object):
 55
 56    """A property that is only computed once per instance and then replaces
 57    itself with an ordinary attribute. Deleting the attribute resets the
 58    property.
 59
 60    https://github.com/bottlepy/bottle/commit/fa7733e075da0d790d809aa3d2f53071897e6f76
 61    """
 62
 63    def __init__(self, func):
 64        self.__doc__ = getattr(func, "__doc__")
 65        self.func = func
 66
 67    def __get__(self, obj, cls):
 68        if obj is None:
 69            return self
 70        value = obj.__dict__[self.func.__name__] = self.func(obj)
 71        return value
 72
 73
 74class CHFile(object):
 75
 76    """Class that implementats the Agilent .ch file format version
 77    130. Warning: Not all aspects of the file header is understood,
 78    so there may and probably is information that is not parsed. See
 79    _parse_header_status for an overview of which parts of the header
 80    is understood.
 81
 82    Attributes:
 83        values (numpy.array): The internsity values (y-value) or the
 84        spectrum. The unit for the values is given in `metadata['units']`
 85
 86        metadata (dict): The extracted metadata
 87
 88        filepath (str): The filepath this object was loaded from
 89    """
 90
 91    # Fields is a table of name, offset and type. Types 'x-time' and 'utf16'
 92    # are specially handled, the rest are format arguments for struct unpack
 93    fields = (
 94        ("sequence_line_or_injection", 252, UINT16),
 95        ("injection_or_sequence_line", 256, UINT16),
 96        ("data_offset", 264, UINT32),
 97        ("start_time", 282, "x-time"),
 98        ("end_time", 286, "x-time"),
 99        ("version_string", 326, "utf16"),
100        ("description", 347, "utf16"),
101        ("sample", 858, "utf16"),
102        ("operator", 1880, "utf16"),
103        ("date", 2391, "utf16"),
104        ("inlet", 2492, "utf16"),
105        ("instrument", 2533, "utf16"),
106        ("method", 2574, "utf16"),
107        ("software version", 3601, "utf16"),
108        ("software name", 3089, "utf16"),
109        ("software revision", 3802, "utf16"),
110        ("zero", 4110, INT32),
111        ("units", 4172, "utf16"),
112        ("detector", 4213, "utf16"),
113        ("yscaling", 4732, ENDIAN + "d"),
114    )
115
116    # The start position of the data
117    # Get it from metadata['data_offset'] * 512
118    data_start = 6144
119
120    # The versions of the file format supported by this implementation
121    supported_versions = {130}
122
123    def __init__(self, filepath):
124
125        self.filepath = filepath
126        self.metadata = {}
127        with open(self.filepath, "rb") as file_:
128            self._parse_header(file_)
129            self.values = self._parse_data(file_)
130
131    def _parse_header(self, file_):
132
133        """Parse the header"""
134
135        # Parse and check version
136        length = unpack(UINT8, file_.read(1))[0]
137        parsed = unpack(STRING.format(length), file_.read(length))
138        version = int(parsed[0])
139        if version not in self.supported_versions:
140            raise ValueError("Unsupported file version {}".format(version))
141        self.metadata["magic_number_version"] = version
142
143        # Parse all metadata fields
144        for name, offset, type_ in self.fields:
145            file_.seek(offset)
146            if type_ == "utf16":
147                self.metadata[name] = parse_utf16_string(file_)
148            elif type_ == "x-time":
149                self.metadata[name] = unpack(UINT32, file_.read(4))[0] / 60000
150            else:
151                self.metadata[name] = unpack(type_, file_.read(struct.calcsize(type_)))[
152                    0
153                ]
154
155    def _parse_header_status(self):
156
157        """Print known and unknown parts of the header"""
158
159        file_ = open(self.filepath, "rb")
160
161        print("Header parsing status")
162        # Map positions to fields for all the known fields
163        knowns = {item[1]: item for item in self.fields}
164        # A couple of places has a \x01 byte before a string, these we simply
165        # skip
166        skips = {325, 3600}
167        # Jump to after the magic number version
168        file_.seek(4)
169
170        # Initialize variables for unknown bytes
171        unknown_start = None
172        unknown_bytes = b""
173        # While we have not yet reached the data
174        while file_.tell() < self.data_start:
175            current_position = file_.tell()
176            # Just continue on skip bytes
177            if current_position in skips:
178                file_.read(1)
179                continue
180
181            # If we know about a data field that starts at this point
182            if current_position in knowns:
183                # If we have collected unknown bytes, print them out and reset
184                if unknown_bytes != b"":
185                    print(
186                        "Unknown at", unknown_start, repr(unknown_bytes.rstrip(b"\x00"))
187                    )
188                    unknown_bytes = b""
189                    unknown_start = None
190
191                # Print out the position, type, name and value of the known
192                # value
193                print("Known field at {: >4},".format(current_position), end=" ")
194                name, _, type_ = knowns[current_position]
195                if type_ == "x-time":
196                    print(
197                        'x-time, "{: <19}'.format(name + '"'),
198                        unpack(ENDIAN + "f", file_.read(4))[0] / 60000,
199                    )
200                elif type_ == "utf16":
201                    print(
202                        ' utf16, "{: <19}'.format(name + '"'), parse_utf16_string(file_)
203                    )
204                else:
205                    size = struct.calcsize(type_)
206                    print(
207                        '{: >6}, "{: <19}'.format(type_, name + '"'),
208                        unpack(type_, file_.read(size))[0],
209                    )
210
211            # We do not know about a data field at this position If we have
212            # already collected 4 zero bytes, assume that we are done with
213            # this unkonw field, print and reset
214            else:
215                if unknown_bytes[-4:] == b"\x00\x00\x00\x00":
216                    print(
217                        "Unknown at", unknown_start, repr(unknown_bytes.rstrip(b"\x00"))
218                    )
219                    unknown_bytes = b""
220                    unknown_start = None
221
222                # Read one byte and save it
223                one_byte = file_.read(1)
224                if unknown_bytes == b"":
225                    # Only start a new collection of unknown bytes, if this
226                    # byte is not a zero byte
227                    if one_byte != b"\x00":
228                        unknown_bytes = one_byte
229                        unknown_start = file_.tell() - 1
230                else:
231                    unknown_bytes += one_byte
232
233        file_.close()
234
235    def _parse_data(self, file_):
236
237        """Parse the data. Decompress the delta-encoded data, and scale them
238        with y-scaling"""
239
240        scaling = self.metadata["yscaling"]
241
242        # Go to the end of the file
243        file_.seek(0, 2)
244        stop = file_.tell()
245
246        # Go to the start point of the data
247        file_.seek(self.data_start)
248
249        signal = []
250
251        buff = [0, 0, 0, 0]
252
253        while file_.tell() < stop:
254
255            buff[0] = fread(file_, 1, INT16)[0][0]
256            buff[1] = buff[3]
257
258            if buff[0] << 12 == 0:
259                break
260
261            for i in range(buff[0] & 4095):
262
263                buff[2] = fread(file_, 1, INT16)[0][0]
264
265                if buff[2] != -32768:
266                    buff[1] = buff[1] + buff[2]
267                else:
268                    buff[1] = fread(file_, 1, INT32)[0][0]
269
270                signal.append(buff[1])
271
272            buff[3] = buff[1]
273
274        signal = np.array(signal)
275        signal = signal * scaling
276
277        return signal
278
279    @cached_property
280    def times(self):
281
282        """The time values (x-value) for the data set in minutes"""
283
284        return np.linspace(
285            self.metadata["start_time"], self.metadata["end_time"], len(self.values)
286        )
287
288
if __name__ == "__main__":
    # Ad hoc manual check: parse a local example file
    CHFile("lcdiag.reg")
ENDIAN = '>'
STRING = '>{}s'
UINT8 = '>B'
UINT16 = '>H'
INT16 = '>h'
INT32 = '>i'
UINT32 = '>I'
def fread(fid, nelements, dtype):
29def fread(fid, nelements, dtype):
30
31    """Equivalent to Matlab fread function"""
32
33    if dtype is str:
34        dt = np.uint8  # WARNING: assuming 8-bit ASCII for np.str!
35    else:
36        dt = dtype
37
38    data_array = np.fromfile(fid, dt, nelements)
39    data_array.shape = (nelements, 1)
40
41    return data_array

Equivalent to Matlab fread function

def parse_utf16_string(file_, encoding='UTF16'):
44def parse_utf16_string(file_, encoding="UTF16"):
45
46    """Parse a pascal type UTF16 encoded string from a binary file object"""
47
48    # First read the expected number of CHARACTERS
49    string_length = unpack(UINT8, file_.read(1))[0]
50    # Then read and decode
51    parsed = unpack(STRING.format(2 * string_length), file_.read(2 * string_length))
52    return parsed[0].decode(encoding)

Parse a pascal type UTF16 encoded string from a binary file object

class cached_property:
55class cached_property(object):
56
57    """A property that is only computed once per instance and then replaces
58    itself with an ordinary attribute. Deleting the attribute resets the
59    property.
60
61    https://github.com/bottlepy/bottle/commit/fa7733e075da0d790d809aa3d2f53071897e6f76
62    """
63
64    def __init__(self, func):
65        self.__doc__ = getattr(func, "__doc__")
66        self.func = func
67
68    def __get__(self, obj, cls):
69        if obj is None:
70            return self
71        value = obj.__dict__[self.func.__name__] = self.func(obj)
72        return value

A property that is only computed once per instance and then replaces itself with an ordinary attribute. Deleting the attribute resets the property.

https://github.com/bottlepy/bottle/commit/fa7733e075da0d790d809aa3d2f53071897e6f76

cached_property(func)
64    def __init__(self, func):
65        self.__doc__ = getattr(func, "__doc__")
66        self.func = func
func
class CHFile:
 75class CHFile(object):
 76
 77    """Class that implementats the Agilent .ch file format version
 78    130. Warning: Not all aspects of the file header is understood,
 79    so there may and probably is information that is not parsed. See
 80    _parse_header_status for an overview of which parts of the header
 81    is understood.
 82
 83    Attributes:
 84        values (numpy.array): The internsity values (y-value) or the
 85        spectrum. The unit for the values is given in `metadata['units']`
 86
 87        metadata (dict): The extracted metadata
 88
 89        filepath (str): The filepath this object was loaded from
 90    """
 91
 92    # Fields is a table of name, offset and type. Types 'x-time' and 'utf16'
 93    # are specially handled, the rest are format arguments for struct unpack
 94    fields = (
 95        ("sequence_line_or_injection", 252, UINT16),
 96        ("injection_or_sequence_line", 256, UINT16),
 97        ("data_offset", 264, UINT32),
 98        ("start_time", 282, "x-time"),
 99        ("end_time", 286, "x-time"),
100        ("version_string", 326, "utf16"),
101        ("description", 347, "utf16"),
102        ("sample", 858, "utf16"),
103        ("operator", 1880, "utf16"),
104        ("date", 2391, "utf16"),
105        ("inlet", 2492, "utf16"),
106        ("instrument", 2533, "utf16"),
107        ("method", 2574, "utf16"),
108        ("software version", 3601, "utf16"),
109        ("software name", 3089, "utf16"),
110        ("software revision", 3802, "utf16"),
111        ("zero", 4110, INT32),
112        ("units", 4172, "utf16"),
113        ("detector", 4213, "utf16"),
114        ("yscaling", 4732, ENDIAN + "d"),
115    )
116
117    # The start position of the data
118    # Get it from metadata['data_offset'] * 512
119    data_start = 6144
120
121    # The versions of the file format supported by this implementation
122    supported_versions = {130}
123
124    def __init__(self, filepath):
125
126        self.filepath = filepath
127        self.metadata = {}
128        with open(self.filepath, "rb") as file_:
129            self._parse_header(file_)
130            self.values = self._parse_data(file_)
131
132    def _parse_header(self, file_):
133
134        """Parse the header"""
135
136        # Parse and check version
137        length = unpack(UINT8, file_.read(1))[0]
138        parsed = unpack(STRING.format(length), file_.read(length))
139        version = int(parsed[0])
140        if version not in self.supported_versions:
141            raise ValueError("Unsupported file version {}".format(version))
142        self.metadata["magic_number_version"] = version
143
144        # Parse all metadata fields
145        for name, offset, type_ in self.fields:
146            file_.seek(offset)
147            if type_ == "utf16":
148                self.metadata[name] = parse_utf16_string(file_)
149            elif type_ == "x-time":
150                self.metadata[name] = unpack(UINT32, file_.read(4))[0] / 60000
151            else:
152                self.metadata[name] = unpack(type_, file_.read(struct.calcsize(type_)))[
153                    0
154                ]
155
156    def _parse_header_status(self):
157
158        """Print known and unknown parts of the header"""
159
160        file_ = open(self.filepath, "rb")
161
162        print("Header parsing status")
163        # Map positions to fields for all the known fields
164        knowns = {item[1]: item for item in self.fields}
165        # A couple of places has a \x01 byte before a string, these we simply
166        # skip
167        skips = {325, 3600}
168        # Jump to after the magic number version
169        file_.seek(4)
170
171        # Initialize variables for unknown bytes
172        unknown_start = None
173        unknown_bytes = b""
174        # While we have not yet reached the data
175        while file_.tell() < self.data_start:
176            current_position = file_.tell()
177            # Just continue on skip bytes
178            if current_position in skips:
179                file_.read(1)
180                continue
181
182            # If we know about a data field that starts at this point
183            if current_position in knowns:
184                # If we have collected unknown bytes, print them out and reset
185                if unknown_bytes != b"":
186                    print(
187                        "Unknown at", unknown_start, repr(unknown_bytes.rstrip(b"\x00"))
188                    )
189                    unknown_bytes = b""
190                    unknown_start = None
191
192                # Print out the position, type, name and value of the known
193                # value
194                print("Known field at {: >4},".format(current_position), end=" ")
195                name, _, type_ = knowns[current_position]
196                if type_ == "x-time":
197                    print(
198                        'x-time, "{: <19}'.format(name + '"'),
199                        unpack(ENDIAN + "f", file_.read(4))[0] / 60000,
200                    )
201                elif type_ == "utf16":
202                    print(
203                        ' utf16, "{: <19}'.format(name + '"'), parse_utf16_string(file_)
204                    )
205                else:
206                    size = struct.calcsize(type_)
207                    print(
208                        '{: >6}, "{: <19}'.format(type_, name + '"'),
209                        unpack(type_, file_.read(size))[0],
210                    )
211
212            # We do not know about a data field at this position If we have
213            # already collected 4 zero bytes, assume that we are done with
214            # this unkonw field, print and reset
215            else:
216                if unknown_bytes[-4:] == b"\x00\x00\x00\x00":
217                    print(
218                        "Unknown at", unknown_start, repr(unknown_bytes.rstrip(b"\x00"))
219                    )
220                    unknown_bytes = b""
221                    unknown_start = None
222
223                # Read one byte and save it
224                one_byte = file_.read(1)
225                if unknown_bytes == b"":
226                    # Only start a new collection of unknown bytes, if this
227                    # byte is not a zero byte
228                    if one_byte != b"\x00":
229                        unknown_bytes = one_byte
230                        unknown_start = file_.tell() - 1
231                else:
232                    unknown_bytes += one_byte
233
234        file_.close()
235
236    def _parse_data(self, file_):
237
238        """Parse the data. Decompress the delta-encoded data, and scale them
239        with y-scaling"""
240
241        scaling = self.metadata["yscaling"]
242
243        # Go to the end of the file
244        file_.seek(0, 2)
245        stop = file_.tell()
246
247        # Go to the start point of the data
248        file_.seek(self.data_start)
249
250        signal = []
251
252        buff = [0, 0, 0, 0]
253
254        while file_.tell() < stop:
255
256            buff[0] = fread(file_, 1, INT16)[0][0]
257            buff[1] = buff[3]
258
259            if buff[0] << 12 == 0:
260                break
261
262            for i in range(buff[0] & 4095):
263
264                buff[2] = fread(file_, 1, INT16)[0][0]
265
266                if buff[2] != -32768:
267                    buff[1] = buff[1] + buff[2]
268                else:
269                    buff[1] = fread(file_, 1, INT32)[0][0]
270
271                signal.append(buff[1])
272
273            buff[3] = buff[1]
274
275        signal = np.array(signal)
276        signal = signal * scaling
277
278        return signal
279
280    @cached_property
281    def times(self):
282
283        """The time values (x-value) for the data set in minutes"""
284
285        return np.linspace(
286            self.metadata["start_time"], self.metadata["end_time"], len(self.values)
287        )

Class that implements the Agilent .ch file format version

  1. Warning: Not all aspects of the file header is understood, so there may and probably is information that is not parsed. See _parse_header_status for an overview of which parts of the header is understood.

Attributes: values (numpy.array): The intensity values (y-values) of the spectrum. The unit for the values is given in metadata['units']

metadata (dict): The extracted metadata

filepath (str): The filepath this object was loaded from
CHFile(filepath)
124    def __init__(self, filepath):
125
126        self.filepath = filepath
127        self.metadata = {}
128        with open(self.filepath, "rb") as file_:
129            self._parse_header(file_)
130            self.values = self._parse_data(file_)
fields = (('sequence_line_or_injection', 252, '>H'), ('injection_or_sequence_line', 256, '>H'), ('data_offset', 264, '>I'), ('start_time', 282, 'x-time'), ('end_time', 286, 'x-time'), ('version_string', 326, 'utf16'), ('description', 347, 'utf16'), ('sample', 858, 'utf16'), ('operator', 1880, 'utf16'), ('date', 2391, 'utf16'), ('inlet', 2492, 'utf16'), ('instrument', 2533, 'utf16'), ('method', 2574, 'utf16'), ('software version', 3601, 'utf16'), ('software name', 3089, 'utf16'), ('software revision', 3802, 'utf16'), ('zero', 4110, '>i'), ('units', 4172, 'utf16'), ('detector', 4213, 'utf16'), ('yscaling', 4732, '>d'))
data_start = 6144
supported_versions = {130}
filepath
metadata
def times(unknown):

The time values (x-value) for the data set in minutes