1import os
2import json
3from datetime import datetime
4from csvpath.util.exceptions import InputException, FileException
5from csvpath.util.file_readers import DataFileReader
6from csvpath.util.file_writers import DataFileWriter
7from csvpath.util.nos import Nos
8from csvpath.managers.registrar import Registrar
9from csvpath.managers.listener import Listener
10from csvpath.managers.metadata import Metadata
11
12
class FileRegistrar(Registrar, Listener):
    """@private
    Registers named-file metadata with a tracking system, e.g. an OpenLineage
    server, JSON file, or database.

    Each named-file home directory holds a ``manifest.json`` containing a JSON
    list. ``metadata_update`` appends one entry per registration and the last
    entry in the list is treated as the current version of the named file.
    """

    def __init__(self, csvpaths):
        """Initialize both base classes and keep references to csvpaths and its config."""
        Registrar.__init__(self, csvpaths)
        Listener.__init__(self, csvpaths.config)
        self.csvpaths = csvpaths
        self.config = csvpaths.config
        self.type_name = "file"

    def get_fingerprint(self, home) -> str:
        """Return the fingerprint stored in the most recent manifest entry.

        Args:
            home: path of the named-file home directory.

        Raises:
            InputException: if home does not exist (raised by manifest_path).
            FileException: if the manifest has no entries.
        """
        mpath = self.manifest_path(home)
        man = self.get_manifest(mpath)
        if not man:
            raise FileException(
                f"No fingerprint available for named-file name: {home} at manifest path: {mpath}: manifest: {man}"
            )
        return man[-1]["fingerprint"]

    def manifest_path(self, home) -> str:
        """Return the path of home's manifest.json, creating an empty one if missing.

        Args:
            home: path of the named-file home directory.

        Raises:
            InputException: if the home directory does not exist.
        """
        if not Nos(home).dir_exists():
            raise InputException(f"Named file home does not exist: {home}")
        mf = os.path.join(home, "manifest.json")
        if not Nos(mf).exists():
            # seed the manifest with an empty JSON list so that later reads
            # always parse and appends always have a list to extend.
            with DataFileWriter(path=mf, mode="w") as writer:
                writer.append("[]")
        return mf

    def get_manifest(self, mpath) -> list:
        """Load and return the JSON content of the manifest at mpath."""
        with DataFileReader(mpath) as reader:
            return json.load(reader.source)

    def metadata_update(self, mdata: Metadata) -> None:
        """Append a manifest entry built from mdata and rewrite the manifest.

        The entry records the type, registered file path, file home,
        fingerprint, time string and origin path, plus the mark when
        mdata.mark is not None. The manifest at mdata.manifest_path is read,
        extended, and written back in full.
        """
        manifest_path = mdata.manifest_path
        mani = {
            "type": mdata.type,
            "file": mdata.file_path,
            "file_home": mdata.file_home,
            "fingerprint": mdata.fingerprint,
            "time": mdata.time_string,
            "from": mdata.origin_path,
        }
        if mdata.mark is not None:
            mani["mark"] = mdata.mark
        jdata = self.get_manifest(manifest_path)
        jdata.append(mani)
        with DataFileWriter(path=manifest_path, mode="w") as writer:
            json.dump(jdata, writer.sink, indent=2)
        #
        # drop update into an all-inputs/files record here?
        #

    def register_complete(self, mdata: Metadata) -> None:
        """Validate a registration and distribute it unless it is a repeat.

        Splits mdata.origin_path at the first "#" into path and mark, checks
        the mark against mdata.mark and that the path exists, sets
        mdata.manifest_path and mdata.type, and then calls distribute_update
        unless the newest manifest entry already has the same fingerprint and
        file_home.

        Raises:
            InputException: if the marks do not match, if the origin path does
                not exist, or if the named-file home does not exist.
        """
        home = mdata.name_home
        path, sep, tail = mdata.origin_path.partition("#")
        # an origin path without "#" has no mark; a trailing "#" gives mark ""
        mark = tail if sep else None
        if mark != mdata.mark:
            raise InputException(
                f"File mgr and registrar marks should match: {mdata.mark}, {mark}"
            )
        #
        # s3 paths are not checked for existence here. Nos is used rather than
        # os.path.exists -- presumably so that other non-local backends can
        # also be checked; confirm against Nos.
        #
        if not path.startswith("s3:") and not Nos(path).exists():
            raise InputException(f"Path {path} does not exist")
        #
        # if the fingerprint already exists we don't store the file again.
        # we rename the file to the fingerprint. from this point the registrar
        # is responsible for the location of the current version of the file.
        # that is appropriate because the file manager isn't responsible for
        # identification, only divvying up activity between its workers,
        # the initial file drop off to them, and responding to external
        # requests.
        #
        # create inputs/named_files/name/manifest.json
        # add line in manifest with date->fingerprint->source-location->reg-file-location
        # return path to current / most recent registered file
        #
        mpath = self.manifest_path(home=home)
        mdata.manifest_path = mpath
        mdata.type = self._type_from_sourcepath(path)
        jdata = self.get_manifest(mpath)
        if len(jdata) > 0:
            latest = jdata[-1]
            # if the fingerprints are the same and we haven't renamed
            # the file or moved all the files we don't need to reregister
            # this file. at least that is the thinking today. it is possible
            # we might want to reregister in the case of a new listener
            # being added or for some other reason, but not atm.
            if (
                "fingerprint" in latest
                and latest["fingerprint"] == mdata.fingerprint
                and "file_home" in latest
                and latest["file_home"] == mdata.file_home
            ):
                #
                # leave as info so nobody has to dig to see why no update
                #
                self.csvpaths.logger.info("File has already been registered: %s", jdata)
                return
        self.distribute_update(mdata)

    def type_of_file(self, home: str) -> str:
        """Return the "type" stored in the most recent manifest entry.

        Args:
            home: path of the named-file home directory.

        Raises:
            InputException: if home does not exist (raised by manifest_path).
            FileException: if the manifest has no entries. A freshly created
                manifest is an empty list, so this is reachable before the
                first registration.
        """
        p = self.manifest_path(home)
        m = self.get_manifest(p)
        if not m:
            raise FileException(
                f"No type available for named-file name: {home} at manifest path: {p}: manifest is empty"
            )
        return m[-1]["type"]

    def _type_from_sourcepath(self, sourcepath: str) -> str:
        """Return the file extension of sourcepath, or "Unknown type" if none.

        Anything from the first "#" onward is treated as a mark and ignored,
        and only the last path segment is searched for a ".", so dots in a
        mark or in a directory name are not mistaken for the extension.
        """
        # drop the mark first so a "." inside it cannot be picked up
        name = sourcepath.partition("#")[0]
        # keep only the final segment; accept both separator styles
        cut = max(name.rfind("/"), name.rfind("\\"))
        name = name[cut + 1 :]
        i = name.rfind(".")
        if i == -1:
            return "Unknown type"
        return name[i + 1 :]

    def registered_file(self, home: str) -> str:
        """Return the path of the current registered file, with "#mark" appended if marked.

        Args:
            home: path of the named-file home directory.

        Raises:
            InputException: if home does not exist or its manifest is empty.
            ValueError: if the newest manifest entry has no "file" key.
        """
        mpath = self.manifest_path(home)
        mdata = self.get_manifest(mpath)
        if not mdata:
            raise InputException(f"Manifest for {home} at {mpath} is empty")
        m = mdata[-1]
        if "file" not in m:
            raise ValueError(
                "File path cannot be None. Check your config file and named-files."
            )
        path = m["file"]
        mark = m.get("mark")
        if mark is not None:
            path = f"{path}#{mark}"
        return path