Coverage for src/pchemdb/crc.py: 100%

97 statements  

« prev     ^ index     » next       coverage.py v7.9.1, created at 2025-06-19 09:48 -0700

1"""Utilities for parsing CRC data. 

2 

3Example: Parse CRC CSV into data structure 

4 

5>>> from csv import DictReader 

6>>> from pchemdb.crc import parse_crc 

7>>> with Path(...).open(mode="r", encoding=...) as file: 

8... reader = DictReader(file) 

9... data = [] 

10... for row in reader: 

11... data.extend(parse_crc(row)) 

12""" 

13 

14from importlib.resources import files 

15import json 

16import logging 

17import re 

18from typing import Any 

19from typing import NamedTuple 

20 

21from pint import Quantity 

22from pyEQL import ureg 

23 

24from pchemdb.utils import formula_to_salt 

25 

logger = logging.getLogger(__name__)

# Optional fractional coefficient ("<num>/<denom>", e.g. "1/2CaCl2") followed
# by the remainder of the string, which is treated as the formula.
formula_re = re.compile(r"(?P<coeff>\d+/\d+)?(?P<formula>.+)")
# aqueous HBr/HCl
# Column header carrying a temperature after the line break. The optional
# sign must be a NON-digit character: a bare "." here is greedy and would
# swallow the first digit of an unsigned multi-digit temperature, turning
# "20 ...C" into "0 ...C".
molar_cond_temp_re = re.compile(
    "<i>\u039b</i>/"
    r"S cm<sup>2</sup> mol<sup>-1</sup><br/>(?:(?P<sign>\D)?(?P<temp>\d+.+C))"
)
# aqueous hydro-halogen acids
# Column header carrying a decimal concentration after the line break.
molar_cond_conc_re1 = re.compile(
    "<i>\u039b</i>/"
    r"S cm<sup>2</sup> mol<sup>-1</sup><br/>(?P<conc>\d+\.\d+)"
)
# aqueous electrolytes
# Column header of the form "Λ(<conc> M)/S cm2 mol-1".
molar_cond_conc_re2 = re.compile(
    "<i>\u039b</i>"
    r"<sup></sup>\((?P<conc>\d+\.\d+) M\)/S cm<sup>2 </sup>mol<sup>-1</sup>"
)
# Column header of the form "κ(<conc>%)"; the fractional part is optional.
cond_conc_re = re.compile("<i>\u03ba</i>" r"\((?P<conc>\d+(\.\d+)?)%\)")
# Column header of the form "γ(<conc> m)".
activity_conc_re = re.compile("<i>\u03b3</i>" r"\((?P<conc>\d+\.\d+) m\)")
# Any opening or closing markup tag; used to strip tags from formulae.
xml_tags_re = re.compile(r"<(/)?[^>]+>")
# Temperature used by the parsers whose column header carries no temperature.
DEFAULT_TEMPERATURE = "298.15 K"
# Row key holding the concentration for temperature-dependent datasets.
_CONDUCTIVITY_CONC_KEY = "<i>c</i>/M"
# Units in which parsed conductivities are reported.
_CONDUCTIVITY_UNITS = "S/m"
# Units of molar concentrations read from the data.
_CONCENTRATION_UNITS = "mol/L"
# Units in which solution temperatures are reported.
_TEMPERATURE_UNITS = "K"
# File name of the JSON database inside the package's "_database" directory.
DB_FILE = "crc.json"

53 

54 

class _ParseResult(NamedTuple):
    """One property value parsed from a single column of a CRC row."""

    # Name of the property (the parsers in this module emit "conductivity"
    # or "mean_activity_coefficient").
    prop: str
    # Solute concentration associated with the value.
    conc: Quantity
    # Temperature associated with the value.
    temp: Quantity
    # The property value itself.
    value: Quantity

60 

61 

def _parse_temperature_dependent_molar_conductivity(
    d: dict[str, str], factor: float, temp: str, v: str
) -> _ParseResult:
    """Turn a molar conductivity tabulated at ``temp`` into a conductivity.

    Args:
        d: The CRC row; its ``"<i>c</i>/M"`` entry supplies the molar
            concentration magnitude.
        factor: Multiplier applied to the concentration.
        temp: Temperature string passed to ``ureg.Quantity``.
        v: Molar conductivity magnitude (S cm^2/mol) as text.

    Returns:
        A ``"conductivity"`` result whose value is the product of the scaled
        concentration and the molar conductivity, expressed in S/m.
    """
    molarity = (
        ureg.Quantity(float(d[_CONDUCTIVITY_CONC_KEY]), _CONCENTRATION_UNITS)
        * factor
    )
    molar_conductivity = ureg.Quantity(float(v), "S cm ** 2 /mol")
    conductivity = molarity * molar_conductivity

    return _ParseResult(
        "conductivity",
        molarity,
        ureg.Quantity(temp),
        conductivity.to(_CONDUCTIVITY_UNITS),
    )

79 

80 

def _parse_concentration_dependent_molar_conductivity(
    factor: float, conc_mag: float, v: str
) -> _ParseResult:
    """Turn a molar conductivity tabulated at ``conc_mag`` into a conductivity.

    Args:
        factor: Multiplier applied to the concentration.
        conc_mag: Molar concentration magnitude (mol/L).
        v: Molar conductivity magnitude (S cm^2/mol) as text.

    Returns:
        A ``"conductivity"`` result at the module's default temperature whose
        value is the product of the scaled concentration and the molar
        conductivity, expressed in S/m.
    """
    molarity = ureg.Quantity(conc_mag, _CONCENTRATION_UNITS) * factor
    molar_conductivity = ureg.Quantity(float(v), "S cm ** 2 /mol")
    conductivity = molarity * molar_conductivity

    return _ParseResult(
        "conductivity",
        molarity,
        ureg.Quantity(DEFAULT_TEMPERATURE),
        conductivity.to(_CONDUCTIVITY_UNITS),
    )

98 

99 

def _parse_concentration_dependent_conductivity(
    factor: float, target_conc: float, v: str
) -> _ParseResult:
    """Parse a conductivity tabulated against weight percent.

    Args:
        factor: Multiplier applied to the stored concentration.
        target_conc: Reported weight percent of the solute.
        v: Conductivity magnitude (mS/cm) as text.

    Returns:
        A ``"conductivity"`` result at 20 degC with the value expressed in
        S/m.
    """
    prop = "conductivity"
    prop_units = "mS / cm"

    temp = "20 degC"
    # Store concentration not as true weight percent, but as wt% of
    # solution required to be added to achieve reported wt%
    # This is required because when adding solutes based on weight
    # percent, pyEQL adds the solute in an amount equal to the weight
    # percent instead of adding the amount of solute required for
    # the solute to attain the specified weight percent
    # NOTE(review): target_conc / (100 - target_conc) is a mass *ratio*, yet
    # it is labelled "%" below; confirm against pyEQL whether a factor of 100
    # is missing here.
    conc_mag = target_conc / (100 - target_conc)
    conc_units = "%"
    conc = ureg.Quantity(conc_mag, conc_units) * factor
    # The tabulated quantity is already a conductivity (mS/cm), not a molar
    # conductivity, so it must not be multiplied by the concentration. The
    # previous product only "worked" because "%" is dimensionless.
    value = ureg.Quantity(float(v), prop_units)

    return _ParseResult(
        prop=prop,
        conc=conc,
        temp=ureg.Quantity(temp),
        value=value.to(_CONDUCTIVITY_UNITS),
    )

125 

126 

def _parse_mean_activity_coefficient(
    factor: float, conc_mag: float, v: str
) -> _ParseResult:
    """Parse a mean activity coefficient tabulated against molality.

    Args:
        factor: Multiplier applied to the concentration.
        conc_mag: Concentration magnitude (mol/kg).
        v: Activity coefficient magnitude as text.

    Returns:
        A ``"mean_activity_coefficient"`` result at the module's default
        temperature with a dimensionless value.
    """
    molality = ureg.Quantity(conc_mag, "mol/kg") * factor
    coefficient = ureg.Quantity(float(v), "dimensionless")

    return _ParseResult(
        "mean_activity_coefficient",
        molality,
        ureg.Quantity(DEFAULT_TEMPERATURE),
        coefficient,
    )

144 

145 

# Characters treated as a minus sign in front of a column-header temperature.
_MINUS_SIGNS = frozenset("-\u2010\u2011\u2012\u2013\u2212")


def _temperature_from_match(temp_match: re.Match[str]) -> str:
    """Rebuild the signed temperature string from a temperature-header match.

    The match must expose the ``sign`` and ``temp`` groups.
    """
    sign = temp_match.group("sign")
    temp = temp_match.group("temp")

    if not sign:
        return temp
    if sign.isdecimal():
        # A permissive sign group can capture the first digit of an unsigned
        # temperature; it sits directly before ``temp``, so glue it back on.
        return f"{sign}{temp}"
    if sign in _MINUS_SIGNS:
        # Normalise typographic minus signs to an ASCII hyphen-minus.
        return f"-{temp}"
    # Any other leading character (e.g. "+") does not change the value.
    return temp


def parse_crc(
    d: dict[str, Any],
) -> list[
    tuple[
        dict[str, Any], dict[str, list[tuple[str, str]]], list[tuple[str, str]]
    ]
]:
    """Parse data from CRC.

    Args:
        d: A dictionary corresponding to a row in a CRC .csv file.

    Returns:
        A list of 3-tuples (``solution``, ``solute_data``, ``solution_data``),
        where each item represents a property entry. ``solution`` is a
        dictionary mapping :class:`pyEQL.solution.Solution` constructor
        parameter names to their values. ``solute_data`` is a dictionary mapping
        solute formulae to lists of property-value pairs. ``solution_data`` is a
        list of property-value pairs.

    Raises:
        ValueError: If no formula can be extracted from the row's compound.
    """
    dataset: list[
        tuple[
            dict[str, Any],
            dict[str, list[tuple[str, str]]],
            list[tuple[str, str]],
        ]
    ] = []
    compound = str(d.get("Mol. form.", d.get("Compound")))
    solution = xml_tags_re.sub("", compound)
    formula_match = formula_re.search(solution)

    if not formula_match:
        msg = f"Unable to parse formula: {solution}"
        raise ValueError(msg)

    # A leading "<num>/<denom>" coefficient scales every concentration.
    num, denom = (formula_match.group("coeff") or "1/1").split("/")
    factor = int(num) / int(denom)
    formula = formula_match.group("formula")
    salt = formula_to_salt(formula)

    for k, v in d.items():
        if k is None or not v:
            continue

        # If data key in temperature-dependent dataset,
        # read concentration from "<i>c</i>/M" key
        # read temperature (including its sign) from the column header
        if temp_match := molar_cond_temp_re.search(k):
            res = _parse_temperature_dependent_molar_conductivity(
                d, factor, _temperature_from_match(temp_match), v
            )

        # If data key in concentration-dependent dataset,
        # read concentration from conc_re (activity or conductivity)
        # use dataset temperature
        elif (conc_match := molar_cond_conc_re1.search(k)) or (
            conc_match := molar_cond_conc_re2.search(k)
        ):
            res = _parse_concentration_dependent_molar_conductivity(
                factor, float(conc_match.group("conc")), v
            )
        elif conc_match := cond_conc_re.search(k):
            res = _parse_concentration_dependent_conductivity(
                factor, float(conc_match.group("conc")), v
            )
        elif conc_match := activity_conc_re.search(k):
            res = _parse_mean_activity_coefficient(
                factor, float(conc_match.group("conc")), v
            )
        else:
            # Not a recognised property column.
            continue

        # Ion concentrations follow from the salt's stoichiometric numbers.
        solutes = {
            salt.cation: f"{res.conc.m * salt.nu_cation} {res.conc.units}",
            salt.anion: f"{res.conc.m * salt.nu_anion} {res.conc.units}",
        }
        soln = {
            "solutes": solutes,
            "temperature": str(res.temp.to(_TEMPERATURE_UNITS)),
        }
        solute_data: dict[str, list[tuple[str, str]]] = {}
        soln_data = [(res.prop, f"{res.value.m} {res.value.units}")]
        entry = (soln, solute_data, soln_data)
        dataset.append(entry)

    return dataset

232 

233 

def load_crc_database() -> list[
    tuple[dict[str, str], dict[str, list[str]], list[str]]
]:
    """Read the packaged CRC JSON database and return its decoded contents."""
    resource = files("pchemdb") / "_database" / DB_FILE
    return json.loads(resource.read_text(encoding="utf-8"))