csvpath.managers.results.results_manager
# pylint: disable=C0114
import os
from pathlib import Path
import datetime
import dateutil.parser
from typing import Dict, List, Any
from csvpath.util.line_spooler import LineSpooler
from csvpath.util.exceptions import InputException, CsvPathsException
from csvpath.util.reference_parser import ReferenceParser
from csvpath.util.file_readers import DataFileReader
from csvpath.util.file_writers import DataFileWriter
from csvpath.util.nos import Nos
from csvpath.scanning.scanner import Scanner
from ..run.run_metadata import RunMetadata
from ..run.run_registrar import RunRegistrar
from .results_metadata import ResultsMetadata
from .result_metadata import ResultMetadata
from .results_registrar import ResultsRegistrar
from .result_registrar import ResultRegistrar
from .result_serializer import ResultSerializer
from .result import Result
from .result_file_reader import ResultFileReader


class ResultsManager:  # pylint: disable=C0115
    """Holds the results of named-paths runs, keyed by named-paths name.

    Results are kept in memory in `named_results` and, when not in memory,
    are loaded from the archive directory configured on the owning CsvPaths
    instance (`csvpaths.config.archive_path`).
    """

    def __init__(self, *, csvpaths=None):
        """@private"""
        self.named_results = {}
        """@private"""
        self._csvpaths = None
        # reset to None whenever the set of named results changes
        self._variables = None
        # use property
        self.csvpaths = csvpaths
        """@private"""

    @property
    def csvpaths(self):
        """@private"""
        return self._csvpaths

    @csvpaths.setter
    def csvpaths(self, cs) -> None:  # noqa: F821
        """@private"""
        self._csvpaths = cs

    def complete_run(self, *, run_dir, pathsname, results) -> None:
        """@private
        builds a ResultsMetadata from the run's existing manifest and
        registers the run as complete. the manifest must already hold the
        uuid and named-file entries; a missing key raises KeyError."""
        rr = ResultsRegistrar(
            csvpaths=self.csvpaths,
            run_dir=run_dir,
            pathsname=pathsname,
            results=results,
        )
        m = rr.manifest
        mdata = ResultsMetadata(self.csvpaths.config)
        # keep the start time recorded in the manifest if there is one
        if "time" not in m or m["time"] is None:
            mdata.set_time()
        else:
            mdata.time_string = m["time"]
        mdata.uuid_string = m["uuid"]
        mdata.archive_name = self.csvpaths.config.archive_name
        mdata.named_file_fingerprint = m["named_file_fingerprint"]
        mdata.named_file_fingerprint_on_file = m["named_file_fingerprint_on_file"]
        mdata.named_file_name = m["named_file_name"]
        mdata.named_file_path = m["named_file_path"]
        mdata.run_home = run_dir
        mdata.named_paths_name = pathsname
        mdata.named_results_name = pathsname
        rr.register_complete(mdata)

    def start_run(self, *, run_dir, pathsname, filename) -> None:
        """@private
        builds a ResultsMetadata for a new run and registers the start."""
        rr = ResultsRegistrar(
            csvpaths=self.csvpaths,
            run_dir=run_dir,
            pathsname=pathsname,
        )
        mdata = ResultsMetadata(self.csvpaths.config)
        mdata.archive_name = self.csvpaths.config.archive_name
        mdata.named_file_name = filename
        mdata.run_home = run_dir
        mdata.named_paths_name = pathsname
        mdata.named_results_name = pathsname
        rr.register_start(mdata)

    def get_metadata(self, name: str) -> Dict[str, Any]:
        """@private
        gets the run metadata. will include the metadata complete from
        the first results. however, the metadata for individual results must
        come direct from them in order to not overwrite"""
        results = self.get_named_results(name)
        meta = {}
        if results and len(results) > 0:
            rs = results[0]
            path = rs.csvpath
            meta["paths_name"] = rs.paths_name
            meta["file_name"] = rs.file_name
            meta["data_lines"] = path.line_monitor.data_end_line_count
            paths = len(self.csvpaths.paths_manager.get_named_paths(name))
            meta["csvpaths_applied"] = paths
            meta["csvpaths_completed"] = paths == len(results)
            meta["valid"] = self.is_valid(name)
            # the first csvpath's own metadata wins on any key collision
            meta = {**meta, **rs.csvpath.metadata}
        return meta

    def get_specific_named_result(self, name: str, name_or_id: str) -> Result:
        """Return the result under `name` whose csvpath identity equals
        `name_or_id`, or None if there is no such result."""
        results = self.get_named_results(name)
        if results and len(results) > 0:
            for r in results:
                if name_or_id == r.csvpath.identity:
                    return r
        return None  # pragma: no cover

    def get_specific_named_result_manifest(
        self, name: str, name_or_id: str
    ) -> dict[str, str | bool]:
        """Return the manifest of the result found by
        `get_specific_named_result()`, or None if no result matches."""
        r = self.get_specific_named_result(name, name_or_id)
        if r is None:
            return None
        rs = ResultSerializer(self._csvpaths.config.archive_path)
        rr = ResultRegistrar(csvpaths=self.csvpaths, result=r, result_serializer=rs)
        return rr.manifest

    def get_last_named_result(self, *, name: str, before: str = None) -> Result:
        """Return the last result in the named results, or None if there
        are no results. `before` is accepted but not currently used."""
        results = self.get_named_results(name)
        if results and len(results) > 0:
            return results[-1]
        return None

    def is_valid(self, name: str) -> bool:
        """Return True if every result under `name` is valid.

        Returns False when no results can be found for `name`, rather than
        failing while iterating None.
        """
        results = self.get_named_results(name)
        if results is None:
            return False
        return all(r.is_valid for r in results)

    def get_variables(self, name: str) -> Dict[str, Any]:
        """Return the variables of all results under `name` merged into one
        dict. When two results share a variable name the earlier result's
        value is kept. Returns an empty dict when there are no results."""
        results = self.get_named_results(name)
        vs = {}
        for r in results or []:
            vs = {**r.csvpath.variables, **vs}
        return vs

    def has_lines(self, name: str) -> bool:
        """@private
        True if any result under the name has at least one line."""
        results = self.get_named_results(name)
        for r in results or []:
            if r.lines and len(r.lines) > 0:
                return True
        return False

    def get_number_of_results(self, name: str) -> int:
        """Return how many results exist under `name`; 0 if none are found."""
        nr = self.get_named_results(name)
        if nr is None:
            return 0
        return len(nr)

    def has_errors(self, name: str) -> bool:
        """Return True if any result under `name` reports errors."""
        results = self.get_named_results(name)
        for r in results or []:
            if r.has_errors():
                return True
        return False

    def get_number_of_errors(self, name: str) -> int:
        """Return the total error count across all results under `name`."""
        results = self.get_named_results(name)
        errors = 0
        for r in results or []:
            errors += r.errors_count()
        return errors

    def add_named_result(self, result: Result) -> None:
        """@private
        stores the result under its named-paths name and registers the start
        of both the run event and the result event.
        raises InputException if the result lacks a named-file name or a
        named-paths name."""
        if result.file_name is None:
            raise InputException("Results must have a named-file name")
        if result.paths_name is None:
            raise InputException("Results must have a named-paths name")
        name = result.paths_name
        if name not in self.named_results:
            self.named_results[name] = [result]
        else:
            self.named_results[name].append(result)
        self._variables = None
        #
        # this is the beginning of an identity run within a named-paths run.
        # run metadata goes to the central record of runs kicking off within
        # the archive. the run's own more complete record is below as a
        # separate event. this could change, but atm seems reasonable.
        #

        mdata = RunMetadata(self.csvpaths.config)
        mdata.uuid = result.uuid
        mdata.archive_name = self.csvpaths.config.archive_name
        mdata.archive_path = self.csvpaths.config.archive_path
        mdata.time_start = result.run_time
        mdata.run_home = result.run_dir
        mdata.identity = result.identity_or_index
        mdata.named_paths_name = result.paths_name
        mdata.named_file_name = result.file_name
        rr = RunRegistrar(self.csvpaths)
        rr.register_start(mdata)

        #
        # we prep the results event
        #
        # we use the same UUID for both metadata updates because the
        # UUID represents the run, not the metadata object
        #

        mdata = ResultMetadata(self.csvpaths.config)
        mdata.uuid = result.uuid
        mdata.archive_name = self.csvpaths.config.archive_name
        mdata.time_started = result.run_time
        mdata.named_results_name = result.paths_name
        sep = Nos(result.run_dir).sep
        # the run name is the last segment of the run dir path
        mdata.run = result.run_dir[result.run_dir.rfind(sep) + 1 :]
        mdata.run_home = result.run_dir
        mdata.instance_home = result.instance_dir
        mdata.instance_identity = result.identity_or_index
        mdata.input_data_file = result.file_name
        rs = ResultSerializer(self._csvpaths.config.archive_path)
        rr = ResultRegistrar(
            csvpaths=self.csvpaths, result=result, result_serializer=rs
        )
        rr.register_start(mdata)

    def set_named_results(self, results: Dict[str, List[Result]]) -> None:
        """@private
        replaces all held results. the dict keys are ignored; each result is
        re-added under its own paths_name."""
        self.named_results = {}
        for value in results.values():
            self.add_named_results(value)

    def add_named_results(self, results: List[Result]) -> None:
        """@private"""
        for r in results:
            self.add_named_result(r)

    def list_named_results(self) -> list[str]:
        """Return the sorted names found in the archive directory, skipping
        dot-files. Logs a warning and returns an empty list if the archive
        directory does not exist."""
        path = self._csvpaths.config.archive_path
        if Nos(path).dir_exists():
            names = Nos(path).listdir()
            names = [n for n in names if not n.startswith(".")]
            names.sort()
        else:
            self._csvpaths.logger.warning(
                "Archive %s does not exist. If no runs have been attempted yet this is fine.",
                path,
            )
            names = []
        return names

    def do_transfers_if(self, result) -> None:
        """@private
        performs the result's transfers, if its csvpath declares any."""
        transfers = result.csvpath.transfers
        if transfers is None:
            return
        tpaths = self.transfer_paths(result)
        self._do_transfers(tpaths)

    def transfer_paths(self, result) -> list[tuple[str, str, str, str]]:
        """@private
        returns one tuple per transfer declared on the result's csvpath."""
        #
        # 1: filename, no extension needed: data | unmatched
        # 2: variable name containing the path to write to
        # 3: path of source file
        # 4: path to write to
        #
        transfers = result.csvpath.transfers
        tpaths = []
        for t in transfers:
            filefrom = "data.csv" if t[0].startswith("data") else "unmatched.csv"
            varname = t[1]
            pathfrom = self._path_to_result(result, filefrom)
            pathto = self._path_to_transfer_to(result, varname)
            tpaths.append((filefrom, varname, pathfrom, pathto))
        return tpaths

    def _do_transfers(self, tpaths) -> None:
        """@private
        copies each source file (tuple item 3) to its target (tuple item 4)."""
        for t in tpaths:
            pathfrom = t[2]
            pathto = t[3]
            with DataFileReader(pathfrom) as pf, DataFileWriter(
                path=pathto, mode="w"
            ) as pt:
                pt.write(pf.read())

    def _path_to_transfer_to(self, result, t) -> str:
        """@private
        resolves the transfer target path from the variable named `t`,
        relative to the configured transfer root, creating the parent
        directory if needed.
        raises InputException if the variable is missing or its value
        contains '..'."""
        p = result.csvpath.config.transfer_root
        if t not in result.csvpath.variables:
            raise InputException(f"Variable {t} not found in variables")
        f = result.csvpath.variables[t]
        # refuse anything that could climb out of the transfer root
        if f.find("..") != -1:
            raise InputException(f"Transfer path cannot include '..': {f}")
        rp = os.path.join(p, f)
        sep = Nos(rp).sep
        rd = rp[0 : rp.rfind(sep)]
        if not Nos(rd).exists():
            Nos(rd).makedir()
        return rp

    def _path_to_result(self, result, t) -> str:
        """@private
        returns the path of file `t` in the result's instance dir, creating
        the parent directory if it does not exist."""
        d = result.instance_dir
        o = os.path.join(d, t)
        sep = Nos(o).sep
        r = o[0 : o.rfind(sep)]
        if not Nos(r).exists():
            # NOTE(review): both calls are in the original; the second looks
            # redundant after makedirs() -- confirm against Nos before removing.
            Nos(r).makedirs()
            Nos(r).makedir()
        return o

    def save(self, result: Result) -> None:
        """@private
        closes any line spooler, performs transfers, serializes the result
        and registers it as complete.
        raises CsvPathsException if there is no CsvPaths instance."""
        #
        # at this time we're not holding on to the result.
        # we have a place for that, but for now not holding
        # forces the deserialization to work completely, so
        # it is worth more than the minor speed up of caching.
        #
        if self._csvpaths is None:
            raise CsvPathsException("Cannot save because there is no CsvPaths instance")
        if result.lines and isinstance(result.lines, LineSpooler):
            # we are done spooling. need to close whatever may be open.
            result.lines.close()
            # cannot make lines None w/o recreating lines. now we're setting
            # closed to true to indicate that we've written.
            # we don't need the serializer trying to save spooled lines
        #
        # if we are doing a transfer(s) do it here so we can put metadata in about
        # the copy before the metadata is serialized into the results.
        #
        self.do_transfers_if(result)
        rs = ResultSerializer(self._csvpaths.config.archive_path)
        rs.save_result(result)
        ResultRegistrar(
            csvpaths=self.csvpaths, result=result, result_serializer=rs
        ).register_complete()

    # in this form: $group.results.2024-01-01_10-15-20.mypath
    def data_file_for_reference(self, refstr, not_name: str = None) -> str:
        """@private
        resolves a results reference to the path of the data.csv it points to.
        raises InputException if the reference is not a results reference or
        if any part of it does not resolve to an existing location."""
        ref = ReferenceParser(refstr)
        if ref.datatype != ReferenceParser.RESULTS:
            raise InputException(
                f"Reference datatype must be {ReferenceParser.RESULTS}"
            )
        namedpaths = ref.root_major
        instance = ref.name_one
        # name_three is the csvpath (instance dir) within the run
        name_three = ref.name_three
        base = self._csvpaths.config.archive_path
        filename = os.path.join(base, namedpaths)
        if not Nos(filename).dir_exists():
            raise InputException(
                f"Reference {refstr} generated {filename} path that does not point to a previously run named-paths group"
            )
        #
        # instance can have var-subs like:
        #   2024-01-01_10-15-:last
        #   2024-01-01_10-:first
        #   2024-01-01_10-:0
        #
        instance = self._find_instance(
            filename, instance, not_name=not_name, name_three=name_three
        )
        if instance is None:
            # no run matched the var-sub; joining None would be a TypeError
            raise InputException(
                f"Reference {refstr} does not match any named-paths run in {filename}"
            )
        filename = os.path.join(filename, instance)
        if not Nos(filename).dir_exists():
            raise InputException(
                f"Reference {refstr} does not point to a valid named-paths run file at {filename}"
            )
        if name_three is None:
            raise InputException(
                f"Reference {refstr} does not name a csvpath in a named-paths group run"
            )
        filename = os.path.join(filename, name_three)
        if not Nos(filename).dir_exists():
            raise InputException(
                f"Reference to {filename} does not point to a csvpath in a named-paths group run"
            )
        filename = os.path.join(filename, "data.csv")
        if not Nos(filename).exists():
            raise InputException(
                "Reference does not point to a data file resulting from a named-paths group run"
            )
        return filename

    def _find_instance(
        self, filename, instance, not_name: str = None, name_three: str = None
    ) -> str:
        """@private
        remember that you cannot replay a replay using :last. the reason is that both
        runs will be looking for the same assets but the last replay run will not have
        the asset needed. in principle, we could fix this, but in practice, any magic
        we do to make it always work is going to make the lineage more mysterious.
        """
        c = instance.find(":")
        if c == -1:
            # NOTE(review): this branch returns filename joined with instance,
            # while the var-sub branches return a bare run dir name.
            # data_file_for_reference joins the return value onto filename
            # again, which yields the same path only when filename is
            # absolute -- confirm before changing.
            filename = os.path.join(filename, instance)
            return filename
        if not Nos(filename).dir_exists():
            raise InputException(f"The base dir {filename} must exist")
        var = instance[c:]
        instance = instance[0:c]
        ret = None
        if var == ":last":
            ret = self._find_last(
                filename, instance, not_name=not_name, name_three=name_three
            )
        elif var == ":first":
            ret = self._find_first(
                filename, instance, not_name=not_name, name_three=name_three
            )
        else:
            raise InputException(f"Unknown reference var-sub token {var}")
        return ret

    def _find_last(
        self, filename, instance, not_name: str = None, name_three: str = None
    ) -> str:
        """@private"""
        last = True
        return self._find(
            filename, instance, last, not_name=not_name, name_three=name_three
        )

    def _find_first(
        self, filename, instance, not_name: str = None, name_three: str = None
    ) -> str:
        """@private"""
        first = False
        return self._find(
            filename, instance, first, not_name=not_name, name_three=name_three
        )

    def _find(
        self,
        filename,
        instance,
        last: bool = True,
        not_name: str = None,
        name_three: str = None,
    ) -> str:
        """@private
        collects the run dir names under filename that have a manifest.json
        (and, if name_three is given, a manifest.json under that instance dir)
        then picks the first or last of those starting with instance."""
        names = Nos(filename).listdir()
        ns = []
        for n in names:
            if not_name is not None and not_name.endswith(n):
                continue
            if n.startswith("."):
                continue
            # only runs with a manifest are candidates
            mani = os.path.join(filename, n)
            mani = os.path.join(mani, "manifest.json")
            if not Nos(mani).exists():
                continue
            if name_three:
                mani = os.path.join(filename, n)
                mani = os.path.join(mani, name_three)
                mani = os.path.join(mani, "manifest.json")
                if not Nos(mani).exists():
                    continue
            ns.append(n)
        return self._find_in_dir_names(instance, ns, last)

    def _find_in_dir_names(self, instance: str, names, last: bool = True) -> str:
        """@private
        returns the earliest or latest of the names that start with instance,
        ordering them as timestamps, or None if no name matches. a name that
        is not a timestamp in one of the two formats raises ValueError."""
        ms = "%Y-%m-%d_%H-%M-%S.%f"
        s = "%Y-%m-%d_%H-%M-%S"
        names = [n for n in names if n.startswith(instance)]
        if len(names) == 0:
            return None
        names = sorted(
            names,
            key=lambda x: datetime.datetime.strptime(x, ms if x.find(".") > -1 else s),
        )
        #
        # note that with last we may find a replay run that doesn't have
        # the asset we're looking for. that's not great but it is fine --
        # the rule is, no replays of replays using :last. it is on the
        # user to set up their replay appropriately.
        #
        if last is True:
            return names[-1]
        return names[0]

    def get_run_time_str(self, name, run_time) -> str:
        """@private"""
        rs = ResultSerializer(self._csvpaths.config.archive_path)
        t = rs.get_run_dir(paths_name=name, run_time=run_time)
        return t

    def remove_named_results(self, name: str) -> None:
        """@private
        drops the in-memory results for the name.
        raises InputException if the name is not held."""
        #
        # does not get rid of results on disk
        #
        if name in self.named_results:
            del self.named_results[name]
            self._variables = None
        else:
            self.csvpaths.logger.warning(f"Results '{name}' not found")
            #
            # we treat this as a recoverable error because typically the user
            # has complete control of the csvpaths environment, making the
            # problem config that should be addressed.
            #
            # if reached by a reference this error should be trapped at an
            # expression and handled according to the error policy.
            #
            raise InputException(f"Results '{name}' not found")

    def clean_named_results(self, name: str) -> None:
        """@private
        like remove_named_results() but silent if the name is not held."""
        if name in self.named_results:
            self.remove_named_results(name)
            #
            # clean from filesystem too?
            #

    def get_named_results(self, name) -> List[Result]:
        """Return the results for the named-paths name.

        In-memory results are returned if present. Otherwise the results of
        the last run directory under the archive are loaded. If nothing is
        found an error is logged and, depending on the error policy, an
        InputException is raised; otherwise None is returned.
        """
        #
        # CsvPaths instances should not be long lived. they are not servers or
        # agents. for each new run, unless there is a reason to not create a new
        # CsvPaths instance, we would create a new one.
        #
        if name in self.named_results:
            return self.named_results[name]
        #
        # find and load the result, if exists. we find
        # results home with the name. run_home is the
        # last run dir. the results we're looking for are
        # the instance dirs in the run dir.
        # we'll need another method for getting a specific
        # run, rather than the default, the last one.
        #
        path = os.path.join(self.csvpaths.config.archive_path, name)
        self.csvpaths.logger.debug(
            "Attempting to load results for %s from %s", name, path
        )
        if Nos(path).dir_exists():
            # skip dot-files, as _find() and list_named_results() do
            runs = [r for r in Nos(path).listdir() if not r.startswith(".")]
            runs.sort()
            # an existing but empty results dir has no last run to load
            if runs:
                run = runs[-1]
                rs = self.get_named_results_for_run(name=name, run=run)
                if rs is not None:
                    return rs
        #
        # we treat this as a recoverable error because typically the user
        # has complete control of the csvpaths environment, making the
        # problem config that should be addressed.
        #
        # if reached by a reference this error should be trapped at an
        # expression and handled according to the error policy.
        #
        msg = (
            f"Results '{name}' does not exist. Has has that named-paths group been run?"
        )
        self.csvpaths.logger.error(msg)
        if self.csvpaths.ecoms.do_i_raise():
            raise InputException(msg)
        return None

    def get_named_results_for_run(self, *, name: str, run: str) -> list[Result]:
        """Load one Result per entry of the run directory, skipping the
        run's own manifest.json."""
        path = os.path.join(self.csvpaths.config.archive_path, name)
        path = os.path.join(path, run)
        instances = Nos(path).listdir()
        rs = []
        for inst in instances:
            if inst == "manifest.json":
                continue
            r = self.get_named_result_for_instance(
                name=name, run_dir=path, run=run, instance=inst
            )
            rs.append(r)
        return rs

    def get_named_result_for_instance(
        self, *, name: str, run_dir: str, run: str, instance: str
    ) -> Result:
        """Rebuild a Result from the manifest, meta and vars files in the
        instance directory of a run."""
        instance_dir = os.path.join(run_dir, instance)
        mani = ResultFileReader.manifest(instance_dir)
        #
        # csvpath needs to be loaded with all meta.json->metadata and some/most of runtime_data
        #
        csvpath = self.csvpaths.csvpath()
        meta = ResultFileReader.meta(instance_dir)
        runtime_data = None
        if meta:
            #
            # until there's a clear case for more, this is all we're going to load.
            # for the most part, people should be using the metadata, not digging into
            # run objects that may not be current. if we really need to recreate the
            # csvpath perfectly we should probably go back and rethink. maybe pickle?
            #
            runtime_data = meta["runtime_data"]
            csvpath.scanner = Scanner(csvpath=csvpath)
            csvpath.scanner.parse(runtime_data["scan_part"])
            csvpath.metadata = meta["metadata"]
            csvpath.modes.update()
            # NOTE(review): bare attribute access kept from the original;
            # presumably it triggers the identity lookup -- confirm.
            csvpath.identity
            csvpath.scan = runtime_data["scan_part"]
            csvpath.match = runtime_data["match_part"]
            csvpath.delimiter = runtime_data["delimiter"]
            csvpath.quotechar = runtime_data["quotechar"]
        variables = ResultFileReader.vars(instance_dir)
        if variables:
            csvpath.variables = variables
        #
        # this may not be complete. let's see if it works or needs more.
        # NOTE(review): runtime_data is None when there is no meta; the
        # original subscripted meta unconditionally here and would raise --
        # confirm Result accepts None.
        #
        r = Result(
            csvpath=csvpath,
            paths_name=name,
            run_dir=run_dir,
            file_name=mani["actual_data_file"],
            run_index=mani["instance_index"],
            run_time=dateutil.parser.parse(mani["time"]),
            runtime_data=runtime_data,
            by_line=not bool(mani["serial"]),
        )
        return r
class
ResultsManager:
26class ResultsManager: # pylint: disable=C0115 27 def __init__(self, *, csvpaths=None): 28 """@private""" 29 self.named_results = {} 30 """@private""" 31 self._csvpaths = None 32 # use property 33 self.csvpaths = csvpaths 34 """@private""" 35 36 @property 37 def csvpaths(self): 38 """@private""" 39 return self._csvpaths 40 41 @csvpaths.setter 42 def csvpaths(self, cs) -> None: # noqa: F821 43 """@private""" 44 self._csvpaths = cs 45 46 def complete_run(self, *, run_dir, pathsname, results) -> None: 47 """@private""" 48 rr = ResultsRegistrar( 49 csvpaths=self.csvpaths, 50 run_dir=run_dir, 51 pathsname=pathsname, 52 results=results, 53 ) 54 m = rr.manifest 55 mdata = ResultsMetadata(self.csvpaths.config) 56 if "time" not in m or m["time"] is None: 57 mdata.set_time() 58 else: 59 mdata.time_string = m["time"] 60 mdata.uuid_string = m["uuid"] 61 mdata.archive_name = self.csvpaths.config.archive_name 62 mdata.named_file_fingerprint = m["named_file_fingerprint"] 63 mdata.named_file_fingerprint_on_file = m["named_file_fingerprint_on_file"] 64 mdata.named_file_name = m["named_file_name"] 65 mdata.named_file_path = m["named_file_path"] 66 mdata.run_home = run_dir 67 mdata.named_paths_name = pathsname 68 mdata.named_results_name = pathsname 69 rr.register_complete(mdata) 70 71 def start_run(self, *, run_dir, pathsname, filename) -> None: 72 """@private""" 73 rr = ResultsRegistrar( 74 csvpaths=self.csvpaths, 75 run_dir=run_dir, 76 pathsname=pathsname, 77 ) 78 mdata = ResultsMetadata(self.csvpaths.config) 79 mdata.archive_name = self.csvpaths.config.archive_name 80 mdata.named_file_name = filename 81 mdata.run_home = run_dir 82 mdata.named_paths_name = pathsname 83 mdata.named_results_name = pathsname 84 rr.register_start(mdata) 85 86 def get_metadata(self, name: str) -> Dict[str, Any]: 87 """@private 88 gets the run metadata. will include the metadata complete from 89 the first results. 
however, the metadata for individual results must 90 come direct from them in order to not overwrite""" 91 results = self.get_named_results(name) 92 meta = {} 93 if results and len(results) > 0: 94 rs = results[0] 95 path = rs.csvpath 96 meta["paths_name"] = rs.paths_name 97 meta["file_name"] = rs.file_name 98 meta["data_lines"] = path.line_monitor.data_end_line_count 99 paths = len(self.csvpaths.paths_manager.get_named_paths(name)) 100 meta["csvpaths_applied"] = paths 101 meta["csvpaths_completed"] = paths == len(results) 102 meta["valid"] = self.is_valid(name) 103 meta = {**meta, **rs.csvpath.metadata} 104 return meta 105 106 def get_specific_named_result(self, name: str, name_or_id: str) -> Result: 107 results = self.get_named_results(name) 108 if results and len(results) > 0: 109 for r in results: 110 if name_or_id == r.csvpath.identity: 111 return r 112 return None # pragma: no cover 113 114 def get_specific_named_result_manifest( 115 self, name: str, name_or_id: str 116 ) -> dict[str, str | bool]: 117 r = self.get_specific_named_result(name, name_or_id) 118 if r is None: 119 return None 120 rs = ResultSerializer(self._csvpaths.config.archive_path) 121 rr = ResultRegistrar(csvpaths=self.csvpaths, result=r, result_serializer=rs) 122 return rr.manifest 123 124 def get_last_named_result(self, *, name: str, before: str = None) -> Result: 125 results = self.get_named_results(name) 126 if results and len(results) > 0: 127 return results[len(results) - 1] 128 return None 129 130 def is_valid(self, name: str) -> bool: 131 results = self.get_named_results(name) 132 for r in results: 133 if not r.is_valid: 134 return False 135 return True 136 137 def get_variables(self, name: str) -> bool: 138 results = self.get_named_results(name) 139 vs = {} 140 for r in results: 141 vs = {**r.csvpath.variables, **vs} 142 return vs 143 144 def has_lines(self, name: str) -> bool: 145 """@private""" 146 results = self.get_named_results(name) 147 for r in results: 148 if r.lines and 
len(r.lines) > 0: 149 return True 150 return False 151 152 def get_number_of_results(self, name: str) -> int: 153 nr = self.get_named_results(name) 154 if nr is None: 155 return 0 156 return len(nr) 157 158 def has_errors(self, name: str) -> bool: 159 results = self.get_named_results(name) 160 for r in results: 161 if r.has_errors(): 162 return True 163 return False 164 165 def get_number_of_errors(self, name: str) -> bool: 166 results = self.get_named_results(name) 167 errors = 0 168 for r in results: 169 errors += r.errors_count() 170 return errors 171 172 def add_named_result(self, result: Result) -> None: 173 """@private""" 174 if result.file_name is None: 175 raise InputException("Results must have a named-file name") 176 if result.paths_name is None: 177 raise InputException("Results must have a named-paths name") 178 name = result.paths_name 179 if name not in self.named_results: 180 self.named_results[name] = [result] 181 else: 182 self.named_results[name].append(result) 183 self._variables = None 184 # 185 # this is the beginning of an identity run within a named-paths run. 186 # run metadata goes to the central record of runs kicking off within 187 # the archive. the run's own more complete record is below as a 188 # separate event. this could change, but atm seems reasonable. 
189 # 190 191 mdata = RunMetadata(self.csvpaths.config) 192 mdata.uuid = result.uuid 193 mdata.archive_name = self.csvpaths.config.archive_name 194 mdata.archive_path = self.csvpaths.config.archive_path 195 mdata.time_start = result.run_time 196 mdata.run_home = result.run_dir 197 mdata.identity = result.identity_or_index 198 mdata.named_paths_name = result.paths_name 199 mdata.named_file_name = result.file_name 200 rr = RunRegistrar(self.csvpaths) 201 rr.register_start(mdata) 202 203 # 204 # we prep the results event 205 # 206 # we use the same UUID for both metadata updates because the 207 # UUID represents the run, not the metadata object 208 # 209 210 mdata = ResultMetadata(self.csvpaths.config) 211 mdata.uuid = result.uuid 212 mdata.archive_name = self.csvpaths.config.archive_name 213 mdata.time_started = result.run_time 214 mdata.named_results_name = result.paths_name 215 sep = Nos(result.run_dir).sep 216 mdata.run = result.run_dir[result.run_dir.rfind(sep) + 1 :] 217 mdata.run_home = result.run_dir 218 mdata.instance_home = result.instance_dir 219 mdata.instance_identity = result.identity_or_index 220 mdata.input_data_file = result.file_name 221 rs = ResultSerializer(self._csvpaths.config.archive_path) 222 rr = ResultRegistrar( 223 csvpaths=self.csvpaths, result=result, result_serializer=rs 224 ) 225 rr.register_start(mdata) 226 227 def set_named_results(self, results: Dict[str, List[Result]]) -> None: 228 """@private""" 229 self.named_results = {} 230 for value in results.values(): 231 self.add_named_results(value) 232 233 def add_named_results(self, results: List[Result]) -> None: 234 """@private""" 235 for r in results: 236 self.add_named_result(r) 237 238 def list_named_results(self) -> list[str]: 239 path = self._csvpaths.config.archive_path 240 if Nos(path).dir_exists(): 241 names = Nos(path).listdir() 242 names = [n for n in names if not n.startswith(".")] 243 names.sort() 244 else: 245 self._csvpaths.logger.warning( 246 "Archive %s does not exist. 
If no runs have been attempted yet this is fine.", 247 path, 248 ) 249 names = [] 250 return names 251 252 def do_transfers_if(self, result) -> None: 253 """@private""" 254 transfers = result.csvpath.transfers 255 if transfers is None: 256 return 257 tpaths = self.transfer_paths(result) 258 self._do_transfers(tpaths) 259 260 def transfer_paths(self, result) -> list[tuple[str, str, str, str]]: 261 """@private""" 262 # 263 # 1: filename, no extension needed: data | unmatched 264 # 2: variable name containing the path to write to 265 # 3: path of source file 266 # 3: path to write to 267 # 268 transfers = result.csvpath.transfers 269 tpaths = [] 270 for t in transfers: 271 filefrom = "data.csv" if t[0].startswith("data") else "unmatched.csv" 272 varname = t[1] 273 pathfrom = self._path_to_result(result, filefrom) 274 pathto = self._path_to_transfer_to(result, varname) 275 tpaths.append((filefrom, varname, pathfrom, pathto)) 276 return tpaths 277 278 def _do_transfers(self, tpaths) -> None: 279 """@private""" 280 for t in tpaths: 281 pathfrom = t[2] 282 pathto = t[3] 283 with DataFileReader(pathfrom) as pf: 284 with DataFileWriter(path=pathto, mode="w") as pt: 285 pt.write(pf.read()) 286 287 def _path_to_transfer_to(self, result, t) -> str: 288 """@private""" 289 p = result.csvpath.config.transfer_root 290 if t not in result.csvpath.variables: 291 raise InputException(f"Variable {t} not found in variables") 292 f = result.csvpath.variables[t] 293 if f.find("..") != -1: 294 raise InputException("Transfer path cannot include '..': {f}") 295 rp = os.path.join(p, f) 296 sep = Nos(rp).sep 297 rd = rp[0 : rp.rfind(sep)] 298 if not Nos(rd).exists(): 299 Nos(rd).makedir() 300 return rp 301 302 def _path_to_result(self, result, t) -> str: 303 """@private""" 304 d = result.instance_dir 305 o = os.path.join(d, t) 306 sep = Nos(o).sep 307 r = o[0 : o.rfind(sep)] 308 if not Nos(r).exists(): 309 Nos(r).makedirs() 310 Nos(r).makedir() 311 return o 312 313 def save(self, result: 
Result) -> None: 314 """@private""" 315 # 316 # at this time we're not holding on to the result. 317 # we have a place for that, but for now not holding 318 # forces the deserialization to work completely, so 319 # it is worth more than the minor speed up of caching. 320 # 321 if self._csvpaths is None: 322 raise CsvPathsException("Cannot save because there is no CsvPaths instance") 323 if result.lines and isinstance(result.lines, LineSpooler): 324 # we are done spooling. need to close whatever may be open. 325 result.lines.close() 326 # cannot make lines None w/o recreating lines. now we're setting 327 # closed to true to indicate that we've written. 328 # we don't need the serializer trying to save spooled lines 329 # result.lines = None 330 # 331 # if we are doing a transfer(s) do it here so we can put metadata in about 332 # the copy before the metadata is serialized into the results. 333 # 334 self.do_transfers_if(result) 335 rs = ResultSerializer(self._csvpaths.config.archive_path) 336 rs.save_result(result) 337 ResultRegistrar( 338 csvpaths=self.csvpaths, result=result, result_serializer=rs 339 ).register_complete() 340 341 # in this form: $group.results.2024-01-01_10-15-20.mypath 342 def data_file_for_reference(self, refstr, not_name: str = None) -> str: 343 """@private""" 344 ref = ReferenceParser(refstr) 345 if ref.datatype != ReferenceParser.RESULTS: 346 raise InputException( 347 f"Reference datatype must be {ReferenceParser.RESULTS}" 348 ) 349 namedpaths = ref.root_major 350 instance = ref.name_one 351 path = ref.name_three # not used? why? 
352 name_three = ref.name_three 353 base = self._csvpaths.config.archive_path 354 filename = os.path.join(base, namedpaths) 355 if not Nos(filename).dir_exists(): 356 raise InputException( 357 f"Reference {refstr} generated {filename} path that does not point to a previously run named-paths group" 358 ) 359 # 360 # instance can have var-subs like: 361 # 2024-01-01_10-15-:last 362 # 2024-01-01_10-:first 363 # 2024-01-01_10-:0 364 # 365 instance = self._find_instance( 366 filename, instance, not_name=not_name, name_three=name_three 367 ) 368 filename = os.path.join(filename, instance) 369 if not Nos(filename).dir_exists(): 370 raise InputException( 371 f"Reference {refstr} does not point to a valid named-paths run file at {filename}" 372 ) 373 filename = os.path.join(filename, path) 374 if not Nos(filename).dir_exists(): 375 raise InputException( 376 f"Reference to {filename} does not point to a csvpath in a named-paths group run" 377 ) 378 filename = os.path.join(filename, "data.csv") 379 if not Nos(filename).exists(): 380 raise InputException( 381 "Reference does not point to a data file resulting from a named-paths group run" 382 ) 383 return filename 384 385 def _find_instance( 386 self, filename, instance, not_name: str = None, name_three: str = None 387 ) -> str: 388 """@private 389 remember that you cannot replay a replay using :last. the reason is that both 390 runs will be looking for the same assets but the last replay run will not have 391 the asset needed. in principle, we could fix this, but in practice, any magic 392 we do to make it always work is going to make the lineage more mysterious. 
393 """ 394 c = instance.find(":") 395 if c == -1: 396 filename = os.path.join(filename, instance) 397 return filename 398 if not Nos(filename).dir_exists(): 399 raise InputException(f"The base dir {filename} must exist") 400 var = instance[c:] 401 instance = instance[0:c] 402 ret = None 403 if var == ":last": 404 ret = self._find_last( 405 filename, instance, not_name=not_name, name_three=name_three 406 ) 407 elif var == ":first": 408 ret = self._find_first( 409 filename, instance, not_name=not_name, name_three=name_three 410 ) 411 else: 412 raise InputException(f"Unknown reference var-sub token {var}") 413 return ret 414 415 def _find_last( 416 self, filename, instance, not_name: str = None, name_three: str = None 417 ) -> str: 418 """@private""" 419 last = True 420 return self._find( 421 filename, instance, last, not_name=not_name, name_three=name_three 422 ) 423 424 def _find_first( 425 self, filename, instance, not_name: str = None, name_three: str = None 426 ) -> str: 427 """@private""" 428 first = False 429 return self._find( 430 filename, instance, first, not_name=not_name, name_three=name_three 431 ) 432 433 def _find( 434 self, 435 filename, 436 instance, 437 last: bool = True, 438 not_name: str = None, 439 name_three: str = None, 440 ) -> str: 441 """@private""" 442 names = Nos(filename).listdir() 443 ns = [] 444 for n in names: 445 if not_name is not None and not_name.endswith(n): 446 continue 447 if n.startswith("."): 448 continue 449 # 450 # test for manifest existing here? 
451 # 452 mani = os.path.join(filename, n) 453 mani = os.path.join(mani, "manifest.json") 454 if not Nos(mani).exists(): 455 continue 456 if name_three: 457 mani = os.path.join(filename, n) 458 mani = os.path.join(mani, name_three) 459 mani = os.path.join(mani, "manifest.json") 460 if not Nos(mani).exists(): 461 continue 462 ns.append(n) 463 return self._find_in_dir_names(instance, ns, last) 464 465 def _find_in_dir_names(self, instance: str, names, last: bool = True) -> str: 466 """@private""" 467 ms = "%Y-%m-%d_%H-%M-%S.%f" 468 s = "%Y-%m-%d_%H-%M-%S" 469 names = [n for n in names if n.startswith(instance)] 470 if len(names) == 0: 471 return None 472 names = sorted( 473 names, 474 key=lambda x: datetime.datetime.strptime(x, ms if x.find(".") > -1 else s), 475 ) 476 if last is True: 477 i = len(names) 478 # 479 # we drop 1 because -1 for the 0-base. note that we may find a replay 480 # run that doesn't have the asset we're looking for. that's not great 481 # but it is fine -- the rule is, no replays of replays using :last. 482 # it is on the user to set up their replay approprately. 483 # 484 i -= 1 485 if i < 0: 486 self.csvpaths.logger.error( 487 f"Previous run is at count {i} but there is no such run. Returning None." 
488 ) 489 self.csvpaths.logger.info( 490 "Found previous runs: %s matching instance: %s", names, instance 491 ) 492 return None 493 ret = names[i] 494 else: 495 ret = names[0] 496 return ret 497 498 def get_run_time_str(self, name, run_time) -> str: 499 """@private""" 500 rs = ResultSerializer(self._csvpaths.config.archive_path) 501 t = rs.get_run_dir(paths_name=name, run_time=run_time) 502 return t 503 504 def remove_named_results(self, name: str) -> None: 505 """@private""" 506 # 507 # does not get rid of results on disk 508 # 509 if name in self.named_results: 510 del self.named_results[name] 511 self._variables = None 512 else: 513 self.csvpaths.logger.warning(f"Results '{name}' not found") 514 # 515 # we treat this as a recoverable error because typically the user 516 # has complete control of the csvpaths environment, making the 517 # problem config that should be addressed. 518 # 519 # if reached by a reference this error should be trapped at an 520 # expression and handled according to the error policy. 521 # 522 raise InputException(f"Results '{name}' not found") 523 524 def clean_named_results(self, name: str) -> None: 525 """@private""" 526 if name in self.named_results: 527 self.remove_named_results(name) 528 # 529 # clean from filesystem too? 530 # 531 532 def get_named_results(self, name) -> List[List[Any]]: 533 # 534 # CsvPaths instances should not be long lived. they are not servers or 535 # agents. for each new run, unless there is a reason to not create a new 536 # CsvPaths instance, we would create a new one. 537 # 538 if name in self.named_results: 539 return self.named_results[name] 540 # 541 # find and load the result, if exists. we find 542 # results home with the name. run_home is the 543 # last run dir. the results we're looking for are 544 # the instance dirs in the run dir. 545 # we'll need another method for getting a specific 546 # run, rather than the default, the last one. 
547 # 548 path = os.path.join(self.csvpaths.config.archive_path, name) 549 self.csvpaths.logger.debug( 550 "Attempting to load results for %s from %s", name, path 551 ) 552 if Nos(path).dir_exists(): 553 runs = Nos(path).listdir() 554 runs.sort() 555 run = runs[len(runs) - 1] 556 rs = self.get_named_results_for_run(name=name, run=run) 557 if rs is not None: 558 return rs 559 # 560 # we treat this as a recoverable error because typically the user 561 # has complete control of the csvpaths environment, making the 562 # problem config that should be addressed. 563 # 564 # if reached by a reference this error should be trapped at an 565 # expression and handled according to the error policy. 566 # 567 msg = ( 568 f"Results '{name}' does not exist. Has has that named-paths group been run?" 569 ) 570 self.csvpaths.logger.error(msg) 571 if self.csvpaths.ecoms.do_i_raise(): 572 raise InputException(msg) 573 574 def get_named_results_for_run(self, *, name: str, run: str) -> list[list[Any]]: 575 path = os.path.join(self.csvpaths.config.archive_path, name) 576 path = os.path.join(path, run) 577 instances = Nos(path).listdir() 578 rs = [] 579 for inst in instances: 580 if inst == "manifest.json": 581 continue 582 r = self.get_named_result_for_instance( 583 name=name, run_dir=path, run=run, instance=inst 584 ) 585 rs.append(r) 586 return rs 587 588 def get_named_result_for_instance( 589 self, *, name: str, run_dir: str, run: str, instance: str 590 ) -> list[list[Any]]: 591 instance_dir = os.path.join(run_dir, instance) 592 mani = ResultFileReader.manifest(instance_dir) 593 # 594 # csvpath needs to be loaded with all meta.json->metadata and some/most of runtime_data 595 # 596 csvpath = self.csvpaths.csvpath() 597 meta = ResultFileReader.meta(instance_dir) 598 if meta: 599 # 600 # until there's a clear case for more, this is all we're going to load. 601 # for the most part, people should be using the metadata, not digging into 602 # run objects that may not be current. 
if we really need to recreate the 603 # csvpath perfectly we should probably go back and rethink. maybe pickle? 604 # 605 csvpath.scanner = Scanner(csvpath=csvpath) 606 csvpath.scanner.parse(meta["runtime_data"]["scan_part"]) 607 csvpath.metadata = meta["metadata"] 608 csvpath.modes.update() 609 csvpath.identity 610 csvpath.scan = meta["runtime_data"]["scan_part"] 611 csvpath.match = meta["runtime_data"]["match_part"] 612 csvpath.delimiter = meta["runtime_data"]["delimiter"] 613 csvpath.quotechar = meta["runtime_data"]["quotechar"] 614 vars = ResultFileReader.vars(instance_dir) 615 if vars: 616 csvpath.variables = vars 617 # 618 # this may not be complete. let's see if it works or needs more. 619 # 620 r = Result( 621 csvpath=csvpath, 622 paths_name=name, 623 run_dir=run_dir, 624 file_name=mani["actual_data_file"], 625 run_index=mani["instance_index"], 626 run_time=dateutil.parser.parse(mani["time"]), 627 runtime_data=meta["runtime_data"], 628 by_line=not bool(mani["serial"]), 629 ) 630 return r
def
get_specific_named_result( self, name: str, name_or_id: str) -> csvpath.managers.results.result.Result:
def
get_specific_named_result_manifest(self, name: str, name_or_id: str) -> dict[str, str | bool]:
114 def get_specific_named_result_manifest( 115 self, name: str, name_or_id: str 116 ) -> dict[str, str | bool]: 117 r = self.get_specific_named_result(name, name_or_id) 118 if r is None: 119 return None 120 rs = ResultSerializer(self._csvpaths.config.archive_path) 121 rr = ResultRegistrar(csvpaths=self.csvpaths, result=r, result_serializer=rs) 122 return rr.manifest
def
get_last_named_result( self, *, name: str, before: str = None) -> csvpath.managers.results.result.Result:
def
list_named_results(self) -> list[str]:
def list_named_results(self) -> list[str]:
    """List the names found directly under the archive directory, sorted.

    Entries whose names start with "." are left out. If the archive
    directory does not exist a warning is logged and an empty list is
    returned.
    """
    archive = self._csvpaths.config.archive_path
    if not Nos(archive).dir_exists():
        self._csvpaths.logger.warning(
            "Archive %s does not exist. If no runs have been attempted yet this is fine.",
            archive,
        )
        return []
    return sorted(
        entry for entry in Nos(archive).listdir() if not entry.startswith(".")
    )
def
get_named_results(self, name) -> List[List[Any]]:
def get_named_results(self, name) -> List[List[Any]]:
    """Return the results of the most recent run of a named-paths group.

    Results already held in ``self.named_results`` are returned as-is.
    Otherwise the run directories under ``<archive_path>/<name>`` are
    listed and the one that sorts last is loaded with
    ``get_named_results_for_run``.

    If nothing can be loaded an error is logged and, when the error
    policy (``self.csvpaths.ecoms.do_i_raise()``) says so, an
    ``InputException`` is raised; otherwise ``None`` is returned.
    """
    #
    # CsvPaths instances should not be long lived. they are not servers or
    # agents. for each new run, unless there is a reason to not create a new
    # CsvPaths instance, we would create a new one.
    #
    if name in self.named_results:
        return self.named_results[name]
    #
    # find and load the result, if exists. we find results home with the
    # name. the results we're looking for are the instance dirs in the
    # last run dir. we'll need another method for getting a specific run,
    # rather than the default, the last one.
    #
    path = os.path.join(self.csvpaths.config.archive_path, name)
    self.csvpaths.logger.debug(
        "Attempting to load results for %s from %s", name, path
    )
    if Nos(path).dir_exists():
        #
        # hidden entries are skipped, as list_named_results does, so a stray
        # dot-file is never taken for a run. an empty directory falls through
        # to the not-found handling below rather than raising IndexError.
        #
        runs = sorted(r for r in Nos(path).listdir() if not r.startswith("."))
        if runs:
            rs = self.get_named_results_for_run(name=name, run=runs[-1])
            if rs is not None:
                return rs
    #
    # we treat this as a recoverable error because typically the user
    # has complete control of the csvpaths environment, making the
    # problem config that should be addressed.
    #
    # if reached by a reference this error should be trapped at an
    # expression and handled according to the error policy.
    #
    msg = f"Results '{name}' does not exist. Has that named-paths group been run?"
    self.csvpaths.logger.error(msg)
    if self.csvpaths.ecoms.do_i_raise():
        raise InputException(msg)
    return None
def
get_named_results_for_run(self, *, name: str, run: str) -> list[list[typing.Any]]:
def get_named_results_for_run(self, *, name: str, run: str) -> list[list[Any]]:
    """Load one result per instance directory of a single run.

    The run directory is ``<archive_path>/<name>/<run>``. Every entry in it
    other than ``manifest.json`` is handed to
    ``get_named_result_for_instance`` and the returned objects are
    collected, in listing order, into a list.
    """
    run_dir = os.path.join(os.path.join(self.csvpaths.config.archive_path, name), run)
    return [
        self.get_named_result_for_instance(
            name=name, run_dir=run_dir, run=run, instance=entry
        )
        for entry in Nos(run_dir).listdir()
        # the run's own manifest sits beside the instance dirs; it is not an instance
        if entry != "manifest.json"
    ]
def
get_named_result_for_instance( self, *, name: str, run_dir: str, run: str, instance: str) -> list[list[typing.Any]]:
def get_named_result_for_instance(
    self, *, name: str, run_dir: str, run: str, instance: str
) -> "Result":
    """Rebuild a Result from the files of one instance directory.

    Reads the manifest, meta and vars files under ``<run_dir>/<instance>``
    through ``ResultFileReader`` and uses them to populate a fresh csvpath
    obtained from ``self.csvpaths.csvpath()``.

    ``run`` is accepted for symmetry with ``get_named_results_for_run`` but
    is not used here. If no meta is available the csvpath is left
    unconfigured and the Result is built with ``runtime_data=None``.
    """
    instance_dir = os.path.join(run_dir, instance)
    mani = ResultFileReader.manifest(instance_dir)
    #
    # csvpath needs to be loaded with all meta.json->metadata and some/most of runtime_data
    #
    csvpath = self.csvpaths.csvpath()
    meta = ResultFileReader.meta(instance_dir)
    runtime_data = None
    if meta:
        #
        # until there's a clear case for more, this is all we're going to load.
        # for the most part, people should be using the metadata, not digging into
        # run objects that may not be current. if we really need to recreate the
        # csvpath perfectly we should probably go back and rethink. maybe pickle?
        #
        runtime_data = meta["runtime_data"]
        scan_part = runtime_data["scan_part"]
        csvpath.scanner = Scanner(csvpath=csvpath)
        csvpath.scanner.parse(scan_part)
        csvpath.metadata = meta["metadata"]
        csvpath.modes.update()
        # NOTE(review): bare attribute read kept from the original; presumably
        # the property is read for a side effect -- confirm.
        csvpath.identity
        csvpath.scan = scan_part
        csvpath.match = runtime_data["match_part"]
        csvpath.delimiter = runtime_data["delimiter"]
        csvpath.quotechar = runtime_data["quotechar"]
    # named `variables` so the builtin vars() is not shadowed
    variables = ResultFileReader.vars(instance_dir)
    if variables:
        csvpath.variables = variables
    #
    # this may not be complete. let's see if it works or needs more.
    #
    return Result(
        csvpath=csvpath,
        paths_name=name,
        run_dir=run_dir,
        file_name=mani["actual_data_file"],
        run_index=mani["instance_index"],
        run_time=dateutil.parser.parse(mani["time"]),
        # runtime_data stays None when there was no meta to read
        runtime_data=runtime_data,
        by_line=not bool(mani["serial"]),
    )