csvpath.managers.results.results_manager

  1# pylint: disable=C0114
  2import os
  3from pathlib import Path
  4import datetime
  5import dateutil.parser
  6from typing import Dict, List, Any
  7from csvpath.util.line_spooler import LineSpooler
  8from csvpath.util.exceptions import InputException, CsvPathsException
  9from csvpath.util.reference_parser import ReferenceParser
 10from csvpath.util.file_readers import DataFileReader
 11from csvpath.util.file_writers import DataFileWriter
 12from csvpath.util.nos import Nos
 13from csvpath.scanning.scanner import Scanner
 14from ..run.run_metadata import RunMetadata
 15from ..run.run_registrar import RunRegistrar
 16from .results_metadata import ResultsMetadata
 17from .result_metadata import ResultMetadata
 18from .results_registrar import ResultsRegistrar
 19from .result_registrar import ResultRegistrar
 20from .result_serializer import ResultSerializer
 21from .result import Result
 22from .result_file_reader import ResultFileReader
 23
 24
 25class ResultsManager:  # pylint: disable=C0115
 26    def __init__(self, *, csvpaths=None):
 27        """@private"""
 28        self.named_results = {}
 29        """@private"""
 30        self._csvpaths = None
 31        # use property
 32        self.csvpaths = csvpaths
 33        """@private"""
 34
    @property
    def csvpaths(self):
        """@private
        the CsvPaths instance this manager was given, or None if none is set"""
        return self._csvpaths
 39
    @csvpaths.setter
    def csvpaths(self, cs) -> None:  # noqa: F821
        """@private
        stores cs as the CsvPaths instance this manager works with"""
        self._csvpaths = cs
 44
 45    def complete_run(self, *, run_dir, pathsname, results) -> None:
 46        """@private"""
 47        rr = ResultsRegistrar(
 48            csvpaths=self.csvpaths,
 49            run_dir=run_dir,
 50            pathsname=pathsname,
 51            results=results,
 52        )
 53        m = rr.manifest
 54        mdata = ResultsMetadata(self.csvpaths.config)
 55        if "time" not in m or m["time"] is None:
 56            mdata.set_time()
 57        else:
 58            mdata.time_string = m["time"]
 59        mdata.uuid_string = m["uuid"]
 60        mdata.archive_name = self.csvpaths.config.archive_name
 61        mdata.named_file_fingerprint = m["named_file_fingerprint"]
 62        mdata.named_file_fingerprint_on_file = m["named_file_fingerprint_on_file"]
 63        mdata.named_file_name = m["named_file_name"]
 64        mdata.named_file_path = m["named_file_path"]
 65        mdata.run_home = run_dir
 66        mdata.named_paths_name = pathsname
 67        mdata.named_results_name = pathsname
 68        rr.register_complete(mdata)
 69
 70    def start_run(self, *, run_dir, pathsname, filename) -> None:
 71        """@private"""
 72        rr = ResultsRegistrar(
 73            csvpaths=self.csvpaths,
 74            run_dir=run_dir,
 75            pathsname=pathsname,
 76        )
 77        mdata = ResultsMetadata(self.csvpaths.config)
 78        mdata.archive_name = self.csvpaths.config.archive_name
 79        mdata.named_file_name = filename
 80        mdata.run_home = run_dir
 81        mdata.named_paths_name = pathsname
 82        mdata.named_results_name = pathsname
 83        rr.register_start(mdata)
 84
 85    def get_metadata(self, name: str) -> Dict[str, Any]:
 86        """@private
 87        gets the run metadata. will include the metadata complete from
 88        the first results. however, the metadata for individual results must
 89        come direct from them in order to not overwrite"""
 90        results = self.get_named_results(name)
 91        meta = {}
 92        if results and len(results) > 0:
 93            rs = results[0]
 94            path = rs.csvpath
 95            meta["paths_name"] = rs.paths_name
 96            meta["file_name"] = rs.file_name
 97            meta["data_lines"] = path.line_monitor.data_end_line_count
 98            paths = len(self.csvpaths.paths_manager.get_named_paths(name))
 99            meta["csvpaths_applied"] = paths
100            meta["csvpaths_completed"] = paths == len(results)
101            meta["valid"] = self.is_valid(name)
102            meta = {**meta, **rs.csvpath.metadata}
103        return meta
104
105    def get_specific_named_result(self, name: str, name_or_id: str) -> Result:
106        results = self.get_named_results(name)
107        if results and len(results) > 0:
108            for r in results:
109                if name_or_id == r.csvpath.identity:
110                    return r
111        return None  # pragma: no cover
112
113    def get_specific_named_result_manifest(
114        self, name: str, name_or_id: str
115    ) -> dict[str, str | bool]:
116        r = self.get_specific_named_result(name, name_or_id)
117        if r is None:
118            return None
119        rs = ResultSerializer(self._csvpaths.config.archive_path)
120        rr = ResultRegistrar(csvpaths=self.csvpaths, result=r, result_serializer=rs)
121        return rr.manifest
122
123    def get_last_named_result(self, *, name: str, before: str = None) -> Result:
124        results = self.get_named_results(name)
125        if results and len(results) > 0:
126            return results[len(results) - 1]
127        return None
128
129    def is_valid(self, name: str) -> bool:
130        results = self.get_named_results(name)
131        for r in results:
132            if not r.is_valid:
133                return False
134        return True
135
136    def get_variables(self, name: str) -> bool:
137        results = self.get_named_results(name)
138        vs = {}
139        for r in results:
140            vs = {**r.csvpath.variables, **vs}
141        return vs
142
143    def has_lines(self, name: str) -> bool:
144        """@private"""
145        results = self.get_named_results(name)
146        for r in results:
147            if r.lines and len(r.lines) > 0:
148                return True
149        return False
150
151    def get_number_of_results(self, name: str) -> int:
152        nr = self.get_named_results(name)
153        if nr is None:
154            return 0
155        return len(nr)
156
157    def has_errors(self, name: str) -> bool:
158        results = self.get_named_results(name)
159        for r in results:
160            if r.has_errors():
161                return True
162        return False
163
164    def get_number_of_errors(self, name: str) -> bool:
165        results = self.get_named_results(name)
166        errors = 0
167        for r in results:
168            errors += r.errors_count()
169        return errors
170
171    def add_named_result(self, result: Result) -> None:
172        """@private"""
173        if result.file_name is None:
174            raise InputException("Results must have a named-file name")
175        if result.paths_name is None:
176            raise InputException("Results must have a named-paths name")
177        name = result.paths_name
178        if name not in self.named_results:
179            self.named_results[name] = [result]
180        else:
181            self.named_results[name].append(result)
182        self._variables = None
183        #
184        # this is the beginning of an identity run within a named-paths run.
185        # run metadata goes to the central record of runs kicking off within
186        # the archive. the run's own more complete record is below as a
187        # separate event. this could change, but atm seems reasonable.
188        #
189
190        mdata = RunMetadata(self.csvpaths.config)
191        mdata.uuid = result.uuid
192        mdata.archive_name = self.csvpaths.config.archive_name
193        mdata.archive_path = self.csvpaths.config.archive_path
194        mdata.time_start = result.run_time
195        mdata.run_home = result.run_dir
196        mdata.identity = result.identity_or_index
197        mdata.named_paths_name = result.paths_name
198        mdata.named_file_name = result.file_name
199        rr = RunRegistrar(self.csvpaths)
200        rr.register_start(mdata)
201
202        #
203        # we prep the results event
204        #
205        # we use the same UUID for both metadata updates because the
206        # UUID represents the run, not the metadata object
207        #
208
209        mdata = ResultMetadata(self.csvpaths.config)
210        mdata.uuid = result.uuid
211        mdata.archive_name = self.csvpaths.config.archive_name
212        mdata.time_started = result.run_time
213        mdata.named_results_name = result.paths_name
214        sep = Nos(result.run_dir).sep
215        mdata.run = result.run_dir[result.run_dir.rfind(sep) + 1 :]
216        mdata.run_home = result.run_dir
217        mdata.instance_home = result.instance_dir
218        mdata.instance_identity = result.identity_or_index
219        mdata.input_data_file = result.file_name
220        rs = ResultSerializer(self._csvpaths.config.archive_path)
221        rr = ResultRegistrar(
222            csvpaths=self.csvpaths, result=result, result_serializer=rs
223        )
224        rr.register_start(mdata)
225
226    def set_named_results(self, results: Dict[str, List[Result]]) -> None:
227        """@private"""
228        self.named_results = {}
229        for value in results.values():
230            self.add_named_results(value)
231
232    def add_named_results(self, results: List[Result]) -> None:
233        """@private"""
234        for r in results:
235            self.add_named_result(r)
236
237    def list_named_results(self) -> list[str]:
238        path = self._csvpaths.config.archive_path
239        if Nos(path).dir_exists():
240            names = Nos(path).listdir()
241            names = [n for n in names if not n.startswith(".")]
242            names.sort()
243        else:
244            self._csvpaths.logger.warning(
245                "Archive %s does not exist. If no runs have been attempted yet this is fine.",
246                path,
247            )
248            names = []
249        return names
250
251    def do_transfers_if(self, result) -> None:
252        """@private"""
253        transfers = result.csvpath.transfers
254        if transfers is None:
255            return
256        tpaths = self.transfer_paths(result)
257        self._do_transfers(tpaths)
258
259    def transfer_paths(self, result) -> list[tuple[str, str, str, str]]:
260        """@private"""
261        #
262        # 1: filename, no extension needed: data | unmatched
263        # 2: variable name containing the path to write to
264        # 3: path of source file
265        # 3: path to write to
266        #
267        transfers = result.csvpath.transfers
268        tpaths = []
269        for t in transfers:
270            filefrom = "data.csv" if t[0].startswith("data") else "unmatched.csv"
271            varname = t[1]
272            pathfrom = self._path_to_result(result, filefrom)
273            pathto = self._path_to_transfer_to(result, varname)
274            tpaths.append((filefrom, varname, pathfrom, pathto))
275        return tpaths
276
277    def _do_transfers(self, tpaths) -> None:
278        """@private"""
279        for t in tpaths:
280            pathfrom = t[2]
281            pathto = t[3]
282            with DataFileReader(pathfrom) as pf:
283                with DataFileWriter(path=pathto, mode="w") as pt:
284                    pt.write(pf.read())
285
286    def _path_to_transfer_to(self, result, t) -> str:
287        """@private"""
288        p = result.csvpath.config.transfer_root
289        if t not in result.csvpath.variables:
290            raise InputException(f"Variable {t} not found in variables")
291        f = result.csvpath.variables[t]
292        if f.find("..") != -1:
293            raise InputException("Transfer path cannot include '..': {f}")
294        rp = os.path.join(p, f)
295        sep = Nos(rp).sep
296        rd = rp[0 : rp.rfind(sep)]
297        if not Nos(rd).exists():
298            Nos(rd).makedir()
299        return rp
300
301    def _path_to_result(self, result, t) -> str:
302        """@private"""
303        d = result.instance_dir
304        o = os.path.join(d, t)
305        sep = Nos(o).sep
306        r = o[0 : o.rfind(sep)]
307        if not Nos(r).exists():
308            Nos(r).makedirs()
309            Nos(r).makedir()
310        return o
311
312    def save(self, result: Result) -> None:
313        """@private"""
314        #
315        # at this time we're not holding on to the result.
316        # we have a place for that, but for now not holding
317        # forces the deserialization to work completely, so
318        # it is worth more than the minor speed up of caching.
319        #
320        if self._csvpaths is None:
321            raise CsvPathsException("Cannot save because there is no CsvPaths instance")
322        if result.lines and isinstance(result.lines, LineSpooler):
323            # we are done spooling. need to close whatever may be open.
324            result.lines.close()
325            # cannot make lines None w/o recreating lines. now we're setting
326            # closed to true to indicate that we've written.
327            # we don't need the serializer trying to save spooled lines
328            # result.lines = None
329        #
330        # if we are doing a transfer(s) do it here so we can put metadata in about
331        # the copy before the metadata is serialized into the results.
332        #
333        self.do_transfers_if(result)
334        rs = ResultSerializer(self._csvpaths.config.archive_path)
335        rs.save_result(result)
336        ResultRegistrar(
337            csvpaths=self.csvpaths, result=result, result_serializer=rs
338        ).register_complete()
339
340    # in this form: $group.results.2024-01-01_10-15-20.mypath
341    def data_file_for_reference(self, refstr, not_name: str = None) -> str:
342        """@private"""
343        ref = ReferenceParser(refstr)
344        if ref.datatype != ReferenceParser.RESULTS:
345            raise InputException(
346                f"Reference datatype must be {ReferenceParser.RESULTS}"
347            )
348        namedpaths = ref.root_major
349        instance = ref.name_one
350        path = ref.name_three  # not used? why?
351        name_three = ref.name_three
352        base = self._csvpaths.config.archive_path
353        filename = os.path.join(base, namedpaths)
354        if not Nos(filename).dir_exists():
355            raise InputException(
356                f"Reference {refstr} generated {filename} path that does not point to a previously run named-paths group"
357            )
358        #
359        # instance can have var-subs like:
360        #   2024-01-01_10-15-:last
361        #   2024-01-01_10-:first
362        #   2024-01-01_10-:0
363        #
364        instance = self._find_instance(
365            filename, instance, not_name=not_name, name_three=name_three
366        )
367        filename = os.path.join(filename, instance)
368        if not Nos(filename).dir_exists():
369            raise InputException(
370                f"Reference {refstr} does not point to a valid named-paths run file at {filename}"
371            )
372        filename = os.path.join(filename, path)
373        if not Nos(filename).dir_exists():
374            raise InputException(
375                f"Reference to {filename} does not point to a csvpath in a named-paths group run"
376            )
377        filename = os.path.join(filename, "data.csv")
378        if not Nos(filename).exists():
379            raise InputException(
380                "Reference does not point to a data file resulting from a named-paths group run"
381            )
382        return filename
383
384    def _find_instance(
385        self, filename, instance, not_name: str = None, name_three: str = None
386    ) -> str:
387        """@private
388        remember that you cannot replay a replay using :last. the reason is that both
389        runs will be looking for the same assets but the last replay run will not have
390        the asset needed. in principle, we could fix this, but in practice, any magic
391        we do to make it always work is going to make the lineage more mysterious.
392        """
393        c = instance.find(":")
394        if c == -1:
395            filename = os.path.join(filename, instance)
396            return filename
397        if not Nos(filename).dir_exists():
398            raise InputException(f"The base dir {filename} must exist")
399        var = instance[c:]
400        instance = instance[0:c]
401        ret = None
402        if var == ":last":
403            ret = self._find_last(
404                filename, instance, not_name=not_name, name_three=name_three
405            )
406        elif var == ":first":
407            ret = self._find_first(
408                filename, instance, not_name=not_name, name_three=name_three
409            )
410        else:
411            raise InputException(f"Unknown reference var-sub token {var}")
412        return ret
413
414    def _find_last(
415        self, filename, instance, not_name: str = None, name_three: str = None
416    ) -> str:
417        """@private"""
418        last = True
419        return self._find(
420            filename, instance, last, not_name=not_name, name_three=name_three
421        )
422
423    def _find_first(
424        self, filename, instance, not_name: str = None, name_three: str = None
425    ) -> str:
426        """@private"""
427        first = False
428        return self._find(
429            filename, instance, first, not_name=not_name, name_three=name_three
430        )
431
432    def _find(
433        self,
434        filename,
435        instance,
436        last: bool = True,
437        not_name: str = None,
438        name_three: str = None,
439    ) -> str:
440        """@private"""
441        names = Nos(filename).listdir()
442        ns = []
443        for n in names:
444            if not_name is not None and not_name.endswith(n):
445                continue
446            if n.startswith("."):
447                continue
448            #
449            # test for manifest existing here?
450            #
451            mani = os.path.join(filename, n)
452            mani = os.path.join(mani, "manifest.json")
453            if not Nos(mani).exists():
454                continue
455            if name_three:
456                mani = os.path.join(filename, n)
457                mani = os.path.join(mani, name_three)
458                mani = os.path.join(mani, "manifest.json")
459                if not Nos(mani).exists():
460                    continue
461            ns.append(n)
462        return self._find_in_dir_names(instance, ns, last)
463
464    def _find_in_dir_names(self, instance: str, names, last: bool = True) -> str:
465        """@private"""
466        ms = "%Y-%m-%d_%H-%M-%S.%f"
467        s = "%Y-%m-%d_%H-%M-%S"
468        names = [n for n in names if n.startswith(instance)]
469        if len(names) == 0:
470            return None
471        names = sorted(
472            names,
473            key=lambda x: datetime.datetime.strptime(x, ms if x.find(".") > -1 else s),
474        )
475        if last is True:
476            i = len(names)
477            #
478            # we drop 1 because -1 for the 0-base. note that we may find a replay
479            # run that doesn't have the asset we're looking for. that's not great
480            # but it is fine -- the rule is, no replays of replays using :last.
481            # it is on the user to set up their replay approprately.
482            #
483            i -= 1
484            if i < 0:
485                self.csvpaths.logger.error(
486                    f"Previous run is at count {i} but there is no such run. Returning None."
487                )
488                self.csvpaths.logger.info(
489                    "Found previous runs: %s matching instance: %s", names, instance
490                )
491                return None
492            ret = names[i]
493        else:
494            ret = names[0]
495        return ret
496
497    def get_run_time_str(self, name, run_time) -> str:
498        """@private"""
499        rs = ResultSerializer(self._csvpaths.config.archive_path)
500        t = rs.get_run_dir(paths_name=name, run_time=run_time)
501        return t
502
503    def remove_named_results(self, name: str) -> None:
504        """@private"""
505        #
506        # does not get rid of results on disk
507        #
508        if name in self.named_results:
509            del self.named_results[name]
510            self._variables = None
511        else:
512            self.csvpaths.logger.warning(f"Results '{name}' not found")
513            #
514            # we treat this as a recoverable error because typically the user
515            # has complete control of the csvpaths environment, making the
516            # problem config that should be addressed.
517            #
518            # if reached by a reference this error should be trapped at an
519            # expression and handled according to the error policy.
520            #
521            raise InputException(f"Results '{name}' not found")
522
523    def clean_named_results(self, name: str) -> None:
524        """@private"""
525        if name in self.named_results:
526            self.remove_named_results(name)
527            #
528            # clean from filesystem too?
529            #
530
531    def get_named_results(self, name) -> List[List[Any]]:
532        #
533        # CsvPaths instances should not be long lived. they are not servers or
534        # agents. for each new run, unless there is a reason to not create a new
535        # CsvPaths instance, we would create a new one.
536        #
537        if name in self.named_results:
538            return self.named_results[name]
539        #
540        # find and load the result, if exists. we find
541        # results home with the name. run_home is the
542        # last run dir. the results we're looking for are
543        # the instance dirs in the run dir.
544        # we'll need another method for getting a specific
545        # run, rather than the default, the last one.
546        #
547        path = os.path.join(self.csvpaths.config.archive_path, name)
548        self.csvpaths.logger.debug(
549            "Attempting to load results for %s from %s", name, path
550        )
551        if Nos(path).dir_exists():
552            runs = Nos(path).listdir()
553            runs.sort()
554            run = runs[len(runs) - 1]
555            rs = self.get_named_results_for_run(name=name, run=run)
556            if rs is not None:
557                return rs
558        #
559        # we treat this as a recoverable error because typically the user
560        # has complete control of the csvpaths environment, making the
561        # problem config that should be addressed.
562        #
563        # if reached by a reference this error should be trapped at an
564        # expression and handled according to the error policy.
565        #
566        msg = (
567            f"Results '{name}' does not exist. Has has that named-paths group been run?"
568        )
569        self.csvpaths.logger.error(msg)
570        if self.csvpaths.ecoms.do_i_raise():
571            raise InputException(msg)
572
573    def get_named_results_for_run(self, *, name: str, run: str) -> list[list[Any]]:
574        path = os.path.join(self.csvpaths.config.archive_path, name)
575        path = os.path.join(path, run)
576        instances = Nos(path).listdir()
577        rs = []
578        for inst in instances:
579            if inst == "manifest.json":
580                continue
581            r = self.get_named_result_for_instance(
582                name=name, run_dir=path, run=run, instance=inst
583            )
584            rs.append(r)
585        return rs
586
587    def get_named_result_for_instance(
588        self, *, name: str, run_dir: str, run: str, instance: str
589    ) -> list[list[Any]]:
590        instance_dir = os.path.join(run_dir, instance)
591        mani = ResultFileReader.manifest(instance_dir)
592        #
593        # csvpath needs to be loaded with all meta.json->metadata and some/most of runtime_data
594        #
595        csvpath = self.csvpaths.csvpath()
596        meta = ResultFileReader.meta(instance_dir)
597        if meta:
598            #
599            # until there's a clear case for more, this is all we're going to load.
600            # for the most part, people should be using the metadata, not digging into
601            # run objects that may not be current. if we really need to recreate the
602            # csvpath perfectly we should probably go back and rethink. maybe pickle?
603            #
604            csvpath.scanner = Scanner(csvpath=csvpath)
605            csvpath.scanner.parse(meta["runtime_data"]["scan_part"])
606            csvpath.metadata = meta["metadata"]
607            csvpath.modes.update()
608            csvpath.identity
609            csvpath.scan = meta["runtime_data"]["scan_part"]
610            csvpath.match = meta["runtime_data"]["match_part"]
611            csvpath.delimiter = meta["runtime_data"]["delimiter"]
612            csvpath.quotechar = meta["runtime_data"]["quotechar"]
613        vars = ResultFileReader.vars(instance_dir)
614        if vars:
615            csvpath.variables = vars
616        #
617        # this may not be complete. let's see if it works or needs more.
618        #
619        r = Result(
620            csvpath=csvpath,
621            paths_name=name,
622            run_dir=run_dir,
623            file_name=mani["actual_data_file"],
624            run_index=mani["instance_index"],
625            run_time=dateutil.parser.parse(mani["time"]),
626            runtime_data=meta["runtime_data"],
627            by_line=not bool(mani["serial"]),
628        )
629        return r
class ResultsManager:
 26class ResultsManager:  # pylint: disable=C0115
 27    def __init__(self, *, csvpaths=None):
 28        """@private"""
 29        self.named_results = {}
 30        """@private"""
 31        self._csvpaths = None
 32        # use property
 33        self.csvpaths = csvpaths
 34        """@private"""
 35
    @property
    def csvpaths(self):
        """@private
        the CsvPaths instance this manager was given, or None if none is set"""
        return self._csvpaths
 40
    @csvpaths.setter
    def csvpaths(self, cs) -> None:  # noqa: F821
        """@private
        stores cs as the CsvPaths instance this manager works with"""
        self._csvpaths = cs
 45
 46    def complete_run(self, *, run_dir, pathsname, results) -> None:
 47        """@private"""
 48        rr = ResultsRegistrar(
 49            csvpaths=self.csvpaths,
 50            run_dir=run_dir,
 51            pathsname=pathsname,
 52            results=results,
 53        )
 54        m = rr.manifest
 55        mdata = ResultsMetadata(self.csvpaths.config)
 56        if "time" not in m or m["time"] is None:
 57            mdata.set_time()
 58        else:
 59            mdata.time_string = m["time"]
 60        mdata.uuid_string = m["uuid"]
 61        mdata.archive_name = self.csvpaths.config.archive_name
 62        mdata.named_file_fingerprint = m["named_file_fingerprint"]
 63        mdata.named_file_fingerprint_on_file = m["named_file_fingerprint_on_file"]
 64        mdata.named_file_name = m["named_file_name"]
 65        mdata.named_file_path = m["named_file_path"]
 66        mdata.run_home = run_dir
 67        mdata.named_paths_name = pathsname
 68        mdata.named_results_name = pathsname
 69        rr.register_complete(mdata)
 70
 71    def start_run(self, *, run_dir, pathsname, filename) -> None:
 72        """@private"""
 73        rr = ResultsRegistrar(
 74            csvpaths=self.csvpaths,
 75            run_dir=run_dir,
 76            pathsname=pathsname,
 77        )
 78        mdata = ResultsMetadata(self.csvpaths.config)
 79        mdata.archive_name = self.csvpaths.config.archive_name
 80        mdata.named_file_name = filename
 81        mdata.run_home = run_dir
 82        mdata.named_paths_name = pathsname
 83        mdata.named_results_name = pathsname
 84        rr.register_start(mdata)
 85
 86    def get_metadata(self, name: str) -> Dict[str, Any]:
 87        """@private
 88        gets the run metadata. will include the metadata complete from
 89        the first results. however, the metadata for individual results must
 90        come direct from them in order to not overwrite"""
 91        results = self.get_named_results(name)
 92        meta = {}
 93        if results and len(results) > 0:
 94            rs = results[0]
 95            path = rs.csvpath
 96            meta["paths_name"] = rs.paths_name
 97            meta["file_name"] = rs.file_name
 98            meta["data_lines"] = path.line_monitor.data_end_line_count
 99            paths = len(self.csvpaths.paths_manager.get_named_paths(name))
100            meta["csvpaths_applied"] = paths
101            meta["csvpaths_completed"] = paths == len(results)
102            meta["valid"] = self.is_valid(name)
103            meta = {**meta, **rs.csvpath.metadata}
104        return meta
105
106    def get_specific_named_result(self, name: str, name_or_id: str) -> Result:
107        results = self.get_named_results(name)
108        if results and len(results) > 0:
109            for r in results:
110                if name_or_id == r.csvpath.identity:
111                    return r
112        return None  # pragma: no cover
113
114    def get_specific_named_result_manifest(
115        self, name: str, name_or_id: str
116    ) -> dict[str, str | bool]:
117        r = self.get_specific_named_result(name, name_or_id)
118        if r is None:
119            return None
120        rs = ResultSerializer(self._csvpaths.config.archive_path)
121        rr = ResultRegistrar(csvpaths=self.csvpaths, result=r, result_serializer=rs)
122        return rr.manifest
123
124    def get_last_named_result(self, *, name: str, before: str = None) -> Result:
125        results = self.get_named_results(name)
126        if results and len(results) > 0:
127            return results[len(results) - 1]
128        return None
129
130    def is_valid(self, name: str) -> bool:
131        results = self.get_named_results(name)
132        for r in results:
133            if not r.is_valid:
134                return False
135        return True
136
137    def get_variables(self, name: str) -> bool:
138        results = self.get_named_results(name)
139        vs = {}
140        for r in results:
141            vs = {**r.csvpath.variables, **vs}
142        return vs
143
144    def has_lines(self, name: str) -> bool:
145        """@private"""
146        results = self.get_named_results(name)
147        for r in results:
148            if r.lines and len(r.lines) > 0:
149                return True
150        return False
151
152    def get_number_of_results(self, name: str) -> int:
153        nr = self.get_named_results(name)
154        if nr is None:
155            return 0
156        return len(nr)
157
158    def has_errors(self, name: str) -> bool:
159        results = self.get_named_results(name)
160        for r in results:
161            if r.has_errors():
162                return True
163        return False
164
165    def get_number_of_errors(self, name: str) -> bool:
166        results = self.get_named_results(name)
167        errors = 0
168        for r in results:
169            errors += r.errors_count()
170        return errors
171
172    def add_named_result(self, result: Result) -> None:
173        """@private"""
174        if result.file_name is None:
175            raise InputException("Results must have a named-file name")
176        if result.paths_name is None:
177            raise InputException("Results must have a named-paths name")
178        name = result.paths_name
179        if name not in self.named_results:
180            self.named_results[name] = [result]
181        else:
182            self.named_results[name].append(result)
183        self._variables = None
184        #
185        # this is the beginning of an identity run within a named-paths run.
186        # run metadata goes to the central record of runs kicking off within
187        # the archive. the run's own more complete record is below as a
188        # separate event. this could change, but atm seems reasonable.
189        #
190
191        mdata = RunMetadata(self.csvpaths.config)
192        mdata.uuid = result.uuid
193        mdata.archive_name = self.csvpaths.config.archive_name
194        mdata.archive_path = self.csvpaths.config.archive_path
195        mdata.time_start = result.run_time
196        mdata.run_home = result.run_dir
197        mdata.identity = result.identity_or_index
198        mdata.named_paths_name = result.paths_name
199        mdata.named_file_name = result.file_name
200        rr = RunRegistrar(self.csvpaths)
201        rr.register_start(mdata)
202
203        #
204        # we prep the results event
205        #
206        # we use the same UUID for both metadata updates because the
207        # UUID represents the run, not the metadata object
208        #
209
210        mdata = ResultMetadata(self.csvpaths.config)
211        mdata.uuid = result.uuid
212        mdata.archive_name = self.csvpaths.config.archive_name
213        mdata.time_started = result.run_time
214        mdata.named_results_name = result.paths_name
215        sep = Nos(result.run_dir).sep
216        mdata.run = result.run_dir[result.run_dir.rfind(sep) + 1 :]
217        mdata.run_home = result.run_dir
218        mdata.instance_home = result.instance_dir
219        mdata.instance_identity = result.identity_or_index
220        mdata.input_data_file = result.file_name
221        rs = ResultSerializer(self._csvpaths.config.archive_path)
222        rr = ResultRegistrar(
223            csvpaths=self.csvpaths, result=result, result_serializer=rs
224        )
225        rr.register_start(mdata)
226
227    def set_named_results(self, results: Dict[str, List[Result]]) -> None:
228        """@private"""
229        self.named_results = {}
230        for value in results.values():
231            self.add_named_results(value)
232
233    def add_named_results(self, results: List[Result]) -> None:
234        """@private"""
235        for r in results:
236            self.add_named_result(r)
237
238    def list_named_results(self) -> list[str]:
239        path = self._csvpaths.config.archive_path
240        if Nos(path).dir_exists():
241            names = Nos(path).listdir()
242            names = [n for n in names if not n.startswith(".")]
243            names.sort()
244        else:
245            self._csvpaths.logger.warning(
246                "Archive %s does not exist. If no runs have been attempted yet this is fine.",
247                path,
248            )
249            names = []
250        return names
251
252    def do_transfers_if(self, result) -> None:
253        """@private"""
254        transfers = result.csvpath.transfers
255        if transfers is None:
256            return
257        tpaths = self.transfer_paths(result)
258        self._do_transfers(tpaths)
259
260    def transfer_paths(self, result) -> list[tuple[str, str, str, str]]:
261        """@private"""
262        #
263        # 1: filename, no extension needed: data | unmatched
264        # 2: variable name containing the path to write to
265        # 3: path of source file
266        # 3: path to write to
267        #
268        transfers = result.csvpath.transfers
269        tpaths = []
270        for t in transfers:
271            filefrom = "data.csv" if t[0].startswith("data") else "unmatched.csv"
272            varname = t[1]
273            pathfrom = self._path_to_result(result, filefrom)
274            pathto = self._path_to_transfer_to(result, varname)
275            tpaths.append((filefrom, varname, pathfrom, pathto))
276        return tpaths
277
278    def _do_transfers(self, tpaths) -> None:
279        """@private"""
280        for t in tpaths:
281            pathfrom = t[2]
282            pathto = t[3]
283            with DataFileReader(pathfrom) as pf:
284                with DataFileWriter(path=pathto, mode="w") as pt:
285                    pt.write(pf.read())
286
287    def _path_to_transfer_to(self, result, t) -> str:
288        """@private"""
289        p = result.csvpath.config.transfer_root
290        if t not in result.csvpath.variables:
291            raise InputException(f"Variable {t} not found in variables")
292        f = result.csvpath.variables[t]
293        if f.find("..") != -1:
294            raise InputException("Transfer path cannot include '..': {f}")
295        rp = os.path.join(p, f)
296        sep = Nos(rp).sep
297        rd = rp[0 : rp.rfind(sep)]
298        if not Nos(rd).exists():
299            Nos(rd).makedir()
300        return rp
301
302    def _path_to_result(self, result, t) -> str:
303        """@private"""
304        d = result.instance_dir
305        o = os.path.join(d, t)
306        sep = Nos(o).sep
307        r = o[0 : o.rfind(sep)]
308        if not Nos(r).exists():
309            Nos(r).makedirs()
310            Nos(r).makedir()
311        return o
312
313    def save(self, result: Result) -> None:
314        """@private"""
315        #
316        # at this time we're not holding on to the result.
317        # we have a place for that, but for now not holding
318        # forces the deserialization to work completely, so
319        # it is worth more than the minor speed up of caching.
320        #
321        if self._csvpaths is None:
322            raise CsvPathsException("Cannot save because there is no CsvPaths instance")
323        if result.lines and isinstance(result.lines, LineSpooler):
324            # we are done spooling. need to close whatever may be open.
325            result.lines.close()
326            # cannot make lines None w/o recreating lines. now we're setting
327            # closed to true to indicate that we've written.
328            # we don't need the serializer trying to save spooled lines
329            # result.lines = None
330        #
331        # if we are doing a transfer(s) do it here so we can put metadata in about
332        # the copy before the metadata is serialized into the results.
333        #
334        self.do_transfers_if(result)
335        rs = ResultSerializer(self._csvpaths.config.archive_path)
336        rs.save_result(result)
337        ResultRegistrar(
338            csvpaths=self.csvpaths, result=result, result_serializer=rs
339        ).register_complete()
340
341    # in this form: $group.results.2024-01-01_10-15-20.mypath
342    def data_file_for_reference(self, refstr, not_name: str = None) -> str:
343        """@private"""
344        ref = ReferenceParser(refstr)
345        if ref.datatype != ReferenceParser.RESULTS:
346            raise InputException(
347                f"Reference datatype must be {ReferenceParser.RESULTS}"
348            )
349        namedpaths = ref.root_major
350        instance = ref.name_one
351        path = ref.name_three  # not used? why?
352        name_three = ref.name_three
353        base = self._csvpaths.config.archive_path
354        filename = os.path.join(base, namedpaths)
355        if not Nos(filename).dir_exists():
356            raise InputException(
357                f"Reference {refstr} generated {filename} path that does not point to a previously run named-paths group"
358            )
359        #
360        # instance can have var-subs like:
361        #   2024-01-01_10-15-:last
362        #   2024-01-01_10-:first
363        #   2024-01-01_10-:0
364        #
365        instance = self._find_instance(
366            filename, instance, not_name=not_name, name_three=name_three
367        )
368        filename = os.path.join(filename, instance)
369        if not Nos(filename).dir_exists():
370            raise InputException(
371                f"Reference {refstr} does not point to a valid named-paths run file at {filename}"
372            )
373        filename = os.path.join(filename, path)
374        if not Nos(filename).dir_exists():
375            raise InputException(
376                f"Reference to {filename} does not point to a csvpath in a named-paths group run"
377            )
378        filename = os.path.join(filename, "data.csv")
379        if not Nos(filename).exists():
380            raise InputException(
381                "Reference does not point to a data file resulting from a named-paths group run"
382            )
383        return filename
384
385    def _find_instance(
386        self, filename, instance, not_name: str = None, name_three: str = None
387    ) -> str:
388        """@private
389        remember that you cannot replay a replay using :last. the reason is that both
390        runs will be looking for the same assets but the last replay run will not have
391        the asset needed. in principle, we could fix this, but in practice, any magic
392        we do to make it always work is going to make the lineage more mysterious.
393        """
394        c = instance.find(":")
395        if c == -1:
396            filename = os.path.join(filename, instance)
397            return filename
398        if not Nos(filename).dir_exists():
399            raise InputException(f"The base dir {filename} must exist")
400        var = instance[c:]
401        instance = instance[0:c]
402        ret = None
403        if var == ":last":
404            ret = self._find_last(
405                filename, instance, not_name=not_name, name_three=name_three
406            )
407        elif var == ":first":
408            ret = self._find_first(
409                filename, instance, not_name=not_name, name_three=name_three
410            )
411        else:
412            raise InputException(f"Unknown reference var-sub token {var}")
413        return ret
414
415    def _find_last(
416        self, filename, instance, not_name: str = None, name_three: str = None
417    ) -> str:
418        """@private"""
419        last = True
420        return self._find(
421            filename, instance, last, not_name=not_name, name_three=name_three
422        )
423
424    def _find_first(
425        self, filename, instance, not_name: str = None, name_three: str = None
426    ) -> str:
427        """@private"""
428        first = False
429        return self._find(
430            filename, instance, first, not_name=not_name, name_three=name_three
431        )
432
433    def _find(
434        self,
435        filename,
436        instance,
437        last: bool = True,
438        not_name: str = None,
439        name_three: str = None,
440    ) -> str:
441        """@private"""
442        names = Nos(filename).listdir()
443        ns = []
444        for n in names:
445            if not_name is not None and not_name.endswith(n):
446                continue
447            if n.startswith("."):
448                continue
449            #
450            # test for manifest existing here?
451            #
452            mani = os.path.join(filename, n)
453            mani = os.path.join(mani, "manifest.json")
454            if not Nos(mani).exists():
455                continue
456            if name_three:
457                mani = os.path.join(filename, n)
458                mani = os.path.join(mani, name_three)
459                mani = os.path.join(mani, "manifest.json")
460                if not Nos(mani).exists():
461                    continue
462            ns.append(n)
463        return self._find_in_dir_names(instance, ns, last)
464
465    def _find_in_dir_names(self, instance: str, names, last: bool = True) -> str:
466        """@private"""
467        ms = "%Y-%m-%d_%H-%M-%S.%f"
468        s = "%Y-%m-%d_%H-%M-%S"
469        names = [n for n in names if n.startswith(instance)]
470        if len(names) == 0:
471            return None
472        names = sorted(
473            names,
474            key=lambda x: datetime.datetime.strptime(x, ms if x.find(".") > -1 else s),
475        )
476        if last is True:
477            i = len(names)
478            #
479            # we drop 1 because -1 for the 0-base. note that we may find a replay
480            # run that doesn't have the asset we're looking for. that's not great
481            # but it is fine -- the rule is, no replays of replays using :last.
482            # it is on the user to set up their replay approprately.
483            #
484            i -= 1
485            if i < 0:
486                self.csvpaths.logger.error(
487                    f"Previous run is at count {i} but there is no such run. Returning None."
488                )
489                self.csvpaths.logger.info(
490                    "Found previous runs: %s matching instance: %s", names, instance
491                )
492                return None
493            ret = names[i]
494        else:
495            ret = names[0]
496        return ret
497
498    def get_run_time_str(self, name, run_time) -> str:
499        """@private"""
500        rs = ResultSerializer(self._csvpaths.config.archive_path)
501        t = rs.get_run_dir(paths_name=name, run_time=run_time)
502        return t
503
504    def remove_named_results(self, name: str) -> None:
505        """@private"""
506        #
507        # does not get rid of results on disk
508        #
509        if name in self.named_results:
510            del self.named_results[name]
511            self._variables = None
512        else:
513            self.csvpaths.logger.warning(f"Results '{name}' not found")
514            #
515            # we treat this as a recoverable error because typically the user
516            # has complete control of the csvpaths environment, making the
517            # problem config that should be addressed.
518            #
519            # if reached by a reference this error should be trapped at an
520            # expression and handled according to the error policy.
521            #
522            raise InputException(f"Results '{name}' not found")
523
524    def clean_named_results(self, name: str) -> None:
525        """@private"""
526        if name in self.named_results:
527            self.remove_named_results(name)
528            #
529            # clean from filesystem too?
530            #
531
532    def get_named_results(self, name) -> List[List[Any]]:
533        #
534        # CsvPaths instances should not be long lived. they are not servers or
535        # agents. for each new run, unless there is a reason to not create a new
536        # CsvPaths instance, we would create a new one.
537        #
538        if name in self.named_results:
539            return self.named_results[name]
540        #
541        # find and load the result, if exists. we find
542        # results home with the name. run_home is the
543        # last run dir. the results we're looking for are
544        # the instance dirs in the run dir.
545        # we'll need another method for getting a specific
546        # run, rather than the default, the last one.
547        #
548        path = os.path.join(self.csvpaths.config.archive_path, name)
549        self.csvpaths.logger.debug(
550            "Attempting to load results for %s from %s", name, path
551        )
552        if Nos(path).dir_exists():
553            runs = Nos(path).listdir()
554            runs.sort()
555            run = runs[len(runs) - 1]
556            rs = self.get_named_results_for_run(name=name, run=run)
557            if rs is not None:
558                return rs
559        #
560        # we treat this as a recoverable error because typically the user
561        # has complete control of the csvpaths environment, making the
562        # problem config that should be addressed.
563        #
564        # if reached by a reference this error should be trapped at an
565        # expression and handled according to the error policy.
566        #
567        msg = (
568            f"Results '{name}' does not exist. Has has that named-paths group been run?"
569        )
570        self.csvpaths.logger.error(msg)
571        if self.csvpaths.ecoms.do_i_raise():
572            raise InputException(msg)
573
574    def get_named_results_for_run(self, *, name: str, run: str) -> list[list[Any]]:
575        path = os.path.join(self.csvpaths.config.archive_path, name)
576        path = os.path.join(path, run)
577        instances = Nos(path).listdir()
578        rs = []
579        for inst in instances:
580            if inst == "manifest.json":
581                continue
582            r = self.get_named_result_for_instance(
583                name=name, run_dir=path, run=run, instance=inst
584            )
585            rs.append(r)
586        return rs
587
588    def get_named_result_for_instance(
589        self, *, name: str, run_dir: str, run: str, instance: str
590    ) -> list[list[Any]]:
591        instance_dir = os.path.join(run_dir, instance)
592        mani = ResultFileReader.manifest(instance_dir)
593        #
594        # csvpath needs to be loaded with all meta.json->metadata and some/most of runtime_data
595        #
596        csvpath = self.csvpaths.csvpath()
597        meta = ResultFileReader.meta(instance_dir)
598        if meta:
599            #
600            # until there's a clear case for more, this is all we're going to load.
601            # for the most part, people should be using the metadata, not digging into
602            # run objects that may not be current. if we really need to recreate the
603            # csvpath perfectly we should probably go back and rethink. maybe pickle?
604            #
605            csvpath.scanner = Scanner(csvpath=csvpath)
606            csvpath.scanner.parse(meta["runtime_data"]["scan_part"])
607            csvpath.metadata = meta["metadata"]
608            csvpath.modes.update()
609            csvpath.identity
610            csvpath.scan = meta["runtime_data"]["scan_part"]
611            csvpath.match = meta["runtime_data"]["match_part"]
612            csvpath.delimiter = meta["runtime_data"]["delimiter"]
613            csvpath.quotechar = meta["runtime_data"]["quotechar"]
614        vars = ResultFileReader.vars(instance_dir)
615        if vars:
616            csvpath.variables = vars
617        #
618        # this may not be complete. let's see if it works or needs more.
619        #
620        r = Result(
621            csvpath=csvpath,
622            paths_name=name,
623            run_dir=run_dir,
624            file_name=mani["actual_data_file"],
625            run_index=mani["instance_index"],
626            run_time=dateutil.parser.parse(mani["time"]),
627            runtime_data=meta["runtime_data"],
628            by_line=not bool(mani["serial"]),
629        )
630        return r
def get_specific_named_result( self, name: str, name_or_id: str) -> csvpath.managers.results.result.Result:
106    def get_specific_named_result(self, name: str, name_or_id: str) -> Result:
107        results = self.get_named_results(name)
108        if results and len(results) > 0:
109            for r in results:
110                if name_or_id == r.csvpath.identity:
111                    return r
112        return None  # pragma: no cover
def get_specific_named_result_manifest(self, name: str, name_or_id: str) -> dict[str, str | bool]:
114    def get_specific_named_result_manifest(
115        self, name: str, name_or_id: str
116    ) -> dict[str, str | bool]:
117        r = self.get_specific_named_result(name, name_or_id)
118        if r is None:
119            return None
120        rs = ResultSerializer(self._csvpaths.config.archive_path)
121        rr = ResultRegistrar(csvpaths=self.csvpaths, result=r, result_serializer=rs)
122        return rr.manifest
def get_last_named_result( self, *, name: str, before: str = None) -> csvpath.managers.results.result.Result:
124    def get_last_named_result(self, *, name: str, before: str = None) -> Result:
125        results = self.get_named_results(name)
126        if results and len(results) > 0:
127            return results[len(results) - 1]
128        return None
def is_valid(self, name: str) -> bool:
130    def is_valid(self, name: str) -> bool:
131        results = self.get_named_results(name)
132        for r in results:
133            if not r.is_valid:
134                return False
135        return True
def get_variables(self, name: str) -> bool:
137    def get_variables(self, name: str) -> bool:
138        results = self.get_named_results(name)
139        vs = {}
140        for r in results:
141            vs = {**r.csvpath.variables, **vs}
142        return vs
def get_number_of_results(self, name: str) -> int:
152    def get_number_of_results(self, name: str) -> int:
153        nr = self.get_named_results(name)
154        if nr is None:
155            return 0
156        return len(nr)
def has_errors(self, name: str) -> bool:
158    def has_errors(self, name: str) -> bool:
159        results = self.get_named_results(name)
160        for r in results:
161            if r.has_errors():
162                return True
163        return False
def get_number_of_errors(self, name: str) -> bool:
165    def get_number_of_errors(self, name: str) -> bool:
166        results = self.get_named_results(name)
167        errors = 0
168        for r in results:
169            errors += r.errors_count()
170        return errors
def list_named_results(self) -> list[str]:
238    def list_named_results(self) -> list[str]:
239        path = self._csvpaths.config.archive_path
240        if Nos(path).dir_exists():
241            names = Nos(path).listdir()
242            names = [n for n in names if not n.startswith(".")]
243            names.sort()
244        else:
245            self._csvpaths.logger.warning(
246                "Archive %s does not exist. If no runs have been attempted yet this is fine.",
247                path,
248            )
249            names = []
250        return names
def get_named_results(self, name) -> List[List[Any]]:
532    def get_named_results(self, name) -> List[List[Any]]:
533        #
534        # CsvPaths instances should not be long lived. they are not servers or
535        # agents. for each new run, unless there is a reason to not create a new
536        # CsvPaths instance, we would create a new one.
537        #
538        if name in self.named_results:
539            return self.named_results[name]
540        #
541        # find and load the result, if exists. we find
542        # results home with the name. run_home is the
543        # last run dir. the results we're looking for are
544        # the instance dirs in the run dir.
545        # we'll need another method for getting a specific
546        # run, rather than the default, the last one.
547        #
548        path = os.path.join(self.csvpaths.config.archive_path, name)
549        self.csvpaths.logger.debug(
550            "Attempting to load results for %s from %s", name, path
551        )
552        if Nos(path).dir_exists():
553            runs = Nos(path).listdir()
554            runs.sort()
555            run = runs[len(runs) - 1]
556            rs = self.get_named_results_for_run(name=name, run=run)
557            if rs is not None:
558                return rs
559        #
560        # we treat this as a recoverable error because typically the user
561        # has complete control of the csvpaths environment, making the
562        # problem config that should be addressed.
563        #
564        # if reached by a reference this error should be trapped at an
565        # expression and handled according to the error policy.
566        #
567        msg = (
568            f"Results '{name}' does not exist. Has has that named-paths group been run?"
569        )
570        self.csvpaths.logger.error(msg)
571        if self.csvpaths.ecoms.do_i_raise():
572            raise InputException(msg)
def get_named_results_for_run(self, *, name: str, run: str) -> list[list[typing.Any]]:
574    def get_named_results_for_run(self, *, name: str, run: str) -> list[list[Any]]:
575        path = os.path.join(self.csvpaths.config.archive_path, name)
576        path = os.path.join(path, run)
577        instances = Nos(path).listdir()
578        rs = []
579        for inst in instances:
580            if inst == "manifest.json":
581                continue
582            r = self.get_named_result_for_instance(
583                name=name, run_dir=path, run=run, instance=inst
584            )
585            rs.append(r)
586        return rs
def get_named_result_for_instance( self, *, name: str, run_dir: str, run: str, instance: str) -> list[list[typing.Any]]:
588    def get_named_result_for_instance(
589        self, *, name: str, run_dir: str, run: str, instance: str
590    ) -> list[list[Any]]:
591        instance_dir = os.path.join(run_dir, instance)
592        mani = ResultFileReader.manifest(instance_dir)
593        #
594        # csvpath needs to be loaded with all meta.json->metadata and some/most of runtime_data
595        #
596        csvpath = self.csvpaths.csvpath()
597        meta = ResultFileReader.meta(instance_dir)
598        if meta:
599            #
600            # until there's a clear case for more, this is all we're going to load.
601            # for the most part, people should be using the metadata, not digging into
602            # run objects that may not be current. if we really need to recreate the
603            # csvpath perfectly we should probably go back and rethink. maybe pickle?
604            #
605            csvpath.scanner = Scanner(csvpath=csvpath)
606            csvpath.scanner.parse(meta["runtime_data"]["scan_part"])
607            csvpath.metadata = meta["metadata"]
608            csvpath.modes.update()
609            csvpath.identity
610            csvpath.scan = meta["runtime_data"]["scan_part"]
611            csvpath.match = meta["runtime_data"]["match_part"]
612            csvpath.delimiter = meta["runtime_data"]["delimiter"]
613            csvpath.quotechar = meta["runtime_data"]["quotechar"]
614        vars = ResultFileReader.vars(instance_dir)
615        if vars:
616            csvpath.variables = vars
617        #
618        # this may not be complete. let's see if it works or needs more.
619        #
620        r = Result(
621            csvpath=csvpath,
622            paths_name=name,
623            run_dir=run_dir,
624            file_name=mani["actual_data_file"],
625            run_index=mani["instance_index"],
626            run_time=dateutil.parser.parse(mani["time"]),
627            runtime_data=meta["runtime_data"],
628            by_line=not bool(mani["serial"]),
629        )
630        return r