csvpath.managers.files.file_registrar

  1import os
  2import json
  3from datetime import datetime
  4from csvpath.util.exceptions import InputException, FileException
  5from csvpath.util.file_readers import DataFileReader
  6from csvpath.util.file_writers import DataFileWriter
  7from csvpath.util.nos import Nos
  8from csvpath.managers.registrar import Registrar
  9from csvpath.managers.listener import Listener
 10from csvpath.managers.metadata import Metadata
 11
 12
 13class FileRegistrar(Registrar, Listener):
 14    """@private
 15    this file registers the metadata with a tracking system. e.g. an OpenLineage
 16    server, JSON file, or database"""
 17
    def __init__(self, csvpaths):
        """Set up the registrar for named-file registration.

        Args:
            csvpaths: object that exposes a ``config`` attribute. It is handed
                to ``Registrar.__init__`` and stored on the instance; its
                ``config`` is handed to ``Listener.__init__`` and stored too.
        """
        # both base classes are initialised explicitly, each with its own argument
        Registrar.__init__(self, csvpaths)
        Listener.__init__(self, csvpaths.config)
        self.csvpaths = csvpaths
        self.config = csvpaths.config
        # NOTE(review): presumably consumed by the base classes to label this
        # registrar's kind -- confirm against Registrar/Listener
        self.type_name = "file"
 24
 25    def get_fingerprint(self, home) -> str:
 26        mpath = self.manifest_path(home)
 27        man = self.get_manifest(mpath)
 28        if man is None or len(man) == 0:
 29            raise FileException(
 30                f"No fingerprint available for named-file name: {home} at manifest path: {mpath}: manifest: {man}"
 31            )
 32        return man[len(man) - 1]["fingerprint"]
 33
 34    def manifest_path(self, home) -> str:
 35        if not Nos(home).dir_exists():
 36            raise InputException(f"Named file home does not exist: {home}")
 37        mf = os.path.join(home, "manifest.json")
 38        if not Nos(mf).exists():
 39            with DataFileWriter(path=mf, mode="w") as writer:
 40                writer.append("[]")
 41        return mf
 42
 43    def get_manifest(self, mpath) -> list:
 44        with DataFileReader(mpath) as reader:
 45            return json.load(reader.source)
 46
 47    def metadata_update(self, mdata: Metadata) -> None:
 48        path = mdata.origin_path
 49        rpath = mdata.file_path
 50        h = mdata.fingerprint
 51        t = mdata.type
 52        mark = mdata.mark
 53        manifest_path = mdata.manifest_path
 54        mani = {}
 55        mani["type"] = t
 56        mani["file"] = rpath
 57        mani["file_home"] = mdata.file_home
 58        mani["fingerprint"] = h
 59        mani["time"] = mdata.time_string
 60        mani["from"] = path
 61        if mark is not None:
 62            mani["mark"] = mark
 63        jdata = self.get_manifest(manifest_path)
 64        jdata.append(mani)
 65        with DataFileWriter(path=manifest_path, mode="w") as writer:
 66            json.dump(jdata, writer.sink, indent=2)
 67        #
 68        # drop update into an all-inputs/files record here?
 69        #
 70
 71    def register_complete(self, mdata: Metadata) -> None:
 72        path = mdata.origin_path
 73        home = mdata.name_home
 74        i = path.find("#")
 75        mark = None
 76        if i > -1:
 77            mark = path[i + 1 :]
 78            path = path[0:i]
 79        if mark != mdata.mark:
 80            raise InputException(
 81                f"File mgr and registrar marks should match: {mdata.mark}, {mark}"
 82            )
 83        if not path.startswith("s3:") and not Nos(path).exists():
 84            # if not path.startswith("s3:") and not os.path.exists(path):
 85            #
 86            # try for a data reader in case we're smart-opening
 87            #
 88            raise InputException(f"Path {path} does not exist")
 89        #
 90        # if the fingerprint already exists we don't store the file again.
 91        # we rename the file to the fingerprint. from this point the registrar
 92        # is responsible for the location of the current version of the file.
 93        # that is approprate because the file manager isn't responsible for
 94        # identification, only divvying up activity between its workers,
 95        # the initial file drop off to them, and responding to external
 96        # requests.
 97        #
 98        # create inputs/named_files/name/manifest.json
 99        # add line in manifest with date->fingerprint->source-location->reg-file-location
100        # return path to current / most recent registered file
101        #
102        mpath = self.manifest_path(home=home)
103        mdata.manifest_path = mpath
104        mdata.type = self._type_from_sourcepath(path)
105        jdata = self.get_manifest(mpath)
106        if len(jdata) > 0:
107            _ = jdata[len(jdata) - 1]
108            # if the fingerprints are the same and we haven't renamed
109            # the file or moved all the files we don't need to reregister
110            # this file. at least that is the thinking today. it is possible
111            # we might want to reregister in the case of a new listener
112            # being added or for some other reason, but not atm.
113            if (
114                "fingerprint" in _
115                and _["fingerprint"] == mdata.fingerprint
116                and "file_home" in _
117                and _["file_home"] == mdata.file_home
118            ):
119                #
120                # leave as info so nobody has to dig to see why no update
121                #
122                self.csvpaths.logger.info("File has already been registered: %s", jdata)
123                return
124        self.distribute_update(mdata)
125
126    def type_of_file(self, home: str) -> str:
127        p = self.manifest_path(home)
128        m = self.get_manifest(p)
129        return m[len(m) - 1]["type"]
130
131    def _type_from_sourcepath(self, sourcepath: str) -> str:
132        i = sourcepath.rfind(".")
133        t = "Unknown type"
134        if i > -1:
135            t = sourcepath[i + 1 :]
136        i = t.find("#")
137        if i > -1:
138            t = t[0:i]
139        return t
140
141    def registered_file(self, home: str) -> str:
142        mpath = self.manifest_path(home)
143        with DataFileReader(mpath) as reader:
144            mdata = json.load(reader.source)
145            if mdata is None or len(mdata) == 0:
146                raise InputException(f"Manifest for {home} at {mpath} is empty")
147            m = mdata[len(mdata) - 1]
148            if "file" not in m:
149                raise ValueError(
150                    "File path cannot be None. Check your config file and named-files."
151                )
152            path = m["file"]
153            mark = None
154            if "mark" in m:
155                mark = m["mark"]
156            if mark is not None:
157                path = f"{path}#{mark}"
158            return path