Coverage for /home/mattis/projects/websites/dighl/edictor/src/edictor/wordlist.py: 63%
46 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-08-07 06:52 +0200
« prev ^ index » next coverage.py v7.6.1, created at 2024-08-07 06:52 +0200
1"""
2Handle EDICTOR wordlist data.
3"""
4import urllib
5import tempfile
6try:
7 import lingpy
8except ImportError:
9 lingpy = False
10try:
11 from lexibase import LexiBase
12except ImportError:
13 LexiBase = False
16# noinspection HttpUrlsUsage
def fetch_wordlist(
    dataset,
    remote_dbase=None,
    concepts=None,
    languages=None,
    columns=None,
    to_lingpy=None,
    transform=None,
    base_url="http://lingulist.de/edictor",
):
    """
    Download wordlist from an EDICTOR server application.

    The request URL is ``<base_url>/triples/get_data.py`` with the selection
    passed as query parameters.

    :param dataset: Dataset name, sent as the ``file`` query value.
    :param remote_dbase: Value of the ``remote_dbase`` query parameter;
        defaults to ``dataset + ".sqlite3"`` when falsy.
    :param concepts: Optional iterable of strings, percent-encoded and joined
        with ``|`` into the ``concepts`` query parameter.
    :param languages: Optional iterable of strings, percent-encoded and joined
        with ``|`` into the ``doculects`` query parameter.
    :param columns: Optional iterable of strings, percent-encoded and joined
        with ``|`` into the ``columns`` query parameter.
    :param to_lingpy: If truthy, the downloaded bytes are written to a
        temporary file whose path is handed to ``transform`` or, when no
        ``transform`` is given, to ``lingpy.Wordlist``.
    :param transform: Optional callable that receives the temporary file path;
        only used when ``to_lingpy`` is truthy.
    :param base_url: Base URL of the EDICTOR server application.
    :returns: The response body decoded as UTF-8, or the object returned by
        ``transform`` / ``lingpy.Wordlist`` when ``to_lingpy`` is truthy.
    :raises ValueError: If ``to_lingpy`` is truthy but lingpy is not installed.
    """
    # A bare ``import urllib`` does not load the ``parse`` and ``request``
    # submodules, so import the needed functions explicitly.
    from urllib.parse import quote
    from urllib.request import urlopen

    # Fail before any network traffic if the requested conversion is
    # impossible anyway.
    if to_lingpy and not lingpy:
        raise ValueError(
            "Package lingpy has to be installed to use this method.")

    url = base_url + "/triples/get_data.py?file=" + dataset
    url += "&remote_dbase=" + (remote_dbase or dataset + ".sqlite3")
    # All list-valued filters are encoded the same way: each value is
    # percent-encoded, values are separated by a pipe.
    for key, values in (
        ("concepts", concepts),
        ("doculects", languages),
        ("columns", columns),
    ):
        if values:
            url += "&" + key + "=" + "|".join(quote(value) for value in values)

    # The context manager closes the connection once the body is read.
    with urlopen(url) as response:
        data = response.read()

    if to_lingpy:
        with tempfile.NamedTemporaryFile() as tf:
            tf.write(data)
            # Flush so the consumer, which reopens the file by name, sees
            # the complete content.
            tf.flush()
            return transform(tf.name) if transform else lingpy.Wordlist(tf.name)
    return data.decode("utf-8")
def get_wordlist(
    path,
    name,
    columns=None,
    preprocessing=None,
    namespace=None,
    lexibase=False,
    custom_args=None
):
    """
    Function retrieves a wordlist from a CLDF dataset.

    The dataset at ``path`` is loaded with ``lingpy.Wordlist.from_cldf``,
    optionally passed through ``preprocessing``, and then either written out
    through ``lingpy.Wordlist.output`` in ``"tsv"`` format or handed to
    ``LexiBase``.

    :param path: Passed to ``lingpy.Wordlist.from_cldf`` as the dataset
        location.
    :param name: Output name; used as ``filename`` for the TSV output, and as
        ``name + ".sqlite3"`` / ``name`` for the LexiBase variant.
    :param columns: Columns requested from the CLDF data; a default selection
        is used when falsy.
    :param preprocessing: Optional callable that receives the loaded wordlist
        and returns the dictionary to export.
    :param namespace: Column-name mapping handed to ``from_cldf``; a default
        mapping is used when falsy.
    :param lexibase: If truthy, create a LexiBase database instead of TSV
        output.
    :param custom_args: If truthy, forwarded to ``preprocessing`` as the
        ``args`` keyword argument.
    :raises ValueError: If lingpy is not installed, or if ``lexibase`` is
        truthy and lexibase is not installed.
    """
    if not lingpy:
        raise ValueError(
            "Package lingpy has to be installed to use this method.")

    default_columns = (
        "language_id", "concept_name", "value", "form", "segments", "comment")
    default_namespace = {
        "language_id": "doculect",
        "concept_name": "concept",
        "value": "value",
        "form": "form",
        "segments": "tokens",
        "comment": "note",
    }
    wordlist = lingpy.Wordlist.from_cldf(
        path,
        columns=columns or default_columns,
        namespace=namespace or default_namespace,
    )

    if not preprocessing:
        # No hook given: export the rows as they are, with the column names
        # stored under key 0.
        rows = {idx: wordlist[idx] for idx in wordlist}
        rows[0] = wordlist.columns
    elif custom_args:
        rows = preprocessing(wordlist, args=custom_args)
    else:
        rows = preprocessing(wordlist)

    if lexibase:
        if not LexiBase:
            raise ValueError(
                "Package lexibase has to be installed to use this method.")
        lex = LexiBase(rows, dbase=name + ".sqlite3")
        lex.create(name)
        return

    lingpy.Wordlist(rows).output(
        "tsv", filename=name, ignore="all", prettify=False)