Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

""" 

Basic parser for Starling data. 

""" 

from __future__ import unicode_literals, division, print_function 

 

from lingpy.read.csv import csv2list 

from lingpy import log 

from lingpy.util import identity 

 

 

def star2qlc(filename, clean_taxnames=False, debug=False):
    """
    Convert a file directly output from Starling to LingPy-QLC format.

    Parameters
    ----------
    filename : str
        Path to the Starling CSV export.
    clean_taxnames : callable, optional
        Function applied to each language (taxon) name taken from the
        header. Any falsy value (the default ``False``) means the names
        are used unchanged (identity).
    debug : bool, optional
        If True, first check that every row has the same number of
        columns as the header; on a mismatch the errors are logged and
        the function returns ``None`` without parsing.

    Returns
    -------
    dict or None
        Mapping of integer row index to
        ``[DOCULECT, CONCEPT, GLOSSID, WORDINSOURCE, ORTHOGRAPHY, IPA, COGID]``,
        with the column header stored under key ``0``. ``None`` is
        returned when the debug check finds malformed rows.

    Raises
    ------
    ValueError
        If the header does not contain the required language, 'Number',
        and 'Word' columns.
    """
    cleant = clean_taxnames or identity
    data = csv2list(filename)

    # strip a leading BOM that Windows editors (e.g. Notepad) may insert
    data[0][0] = data[0][0].replace('\ufeff', '')

    # the first row is the header
    header = data[0]

    # optional sanity check: every data row must match the header length
    if debug:
        error = False
        log.info("Header line has length {0}.".format(len(header)))
        for line in data[1:]:
            if len(line) != len(header):  # pragma: no cover
                log.error("Error for item {0} with length {1}, expected {2}.".format(
                    '/'.join(line[0:2]), len(line), len(header)))
                error = True
        if error:  # pragma: no cover
            log.error("Errors were found, aborting function call.")
            return
        else:
            log.info("Everything went fine, carrying on with function call.")

    # determine language names in the header: a language column is
    # immediately followed by its cognate-id column, whose label contains
    # the language name plus a '#' marker
    taxa = []
    lngIdx = numIdx = wrdIdx = None
    for i in range(len(header) - 1):
        prev = header[i]
        post = header[i + 1]

        if prev in post and '#' in post:
            taxa += [prev]

            # remember where the first language column starts
            if len(taxa) == 1:
                lngIdx = i

        if prev == 'Number':
            numIdx = i

        if prev == 'Word':
            wrdIdx = i

    # fail early with a clear message instead of an UnboundLocalError
    # further down when a required column is missing from the header
    if lngIdx is None or numIdx is None or wrdIdx is None:
        raise ValueError(
            "Could not find all required columns (language, 'Number', "
            "'Word') in the header of {0}.".format(filename))

    log.info('starling, indices (%s, %s, %s)' % (lngIdx, numIdx, wrdIdx))
    log.info('starling, taxa: %s' % taxa)

    # start filling in the dictionary; key 0 holds the QLC column header
    D = {0: [
        'DOCULECT', 'CONCEPT', 'GLOSSID', 'WORDINSOURCE', 'ORTHOGRAPHY', 'IPA', 'COGID']}

    idx = 1
    cognate_counter = 0
    current_concept = ''
    cognate_sets = []
    # data[0] is the header and data[1] a secondary header row, skip both
    for line in data[2:]:
        gloss = line[wrdIdx]
        gnum = line[numIdx]

        # switch to next cognate set if there is a switch in concepts:
        # Starling numbers cognate sets per concept, so offset them by the
        # highest id seen so far to keep ids globally unique
        if current_concept != gloss and len(cognate_sets) != 0:
            max_cog = max(cognate_sets)
            cognate_counter = max_cog
            cognate_sets = []
            current_concept = gloss
        else:
            log.debug('starling, indices (%s, %s, %s)' % (
                gloss, current_concept, cognate_counter))

        # language columns alternate with their cognate-id columns
        for i in range(lngIdx, len(header), 2):
            word = line[i]

            # an entry like "ipa {orthography}" carries both transcriptions
            if '{' in word:
                ipa = word[:word.index('{')].strip()
                ortho = word[word.index('{') + 1:word.index('}')].strip()
            else:
                ipa = word
                ortho = word

            cogid = int(line[i + 1])

            # a cogid of 0 or an empty word cell means "no entry"
            if cogid != 0 and word:
                if cogid > 0:
                    cogid = cogid + cognate_counter

                # append cognate sets, essential for raising the counter
                cognate_sets += [cogid]

                taxon = cleant(header[i])

                D[idx] = [taxon, gloss, gnum, word, ortho, ipa, cogid]
                idx += 1

    # re-iterate through data and reassign cognate sets with negative ids:
    # negative ids mark singletons in Starling, so each one gets a unique
    # (still negative) id to keep them from merging into one cognate set
    for k in D:
        if k:  # skip the header row stored under key 0
            cogid = D[k][-1]
            if cogid < 0:
                cogid = -cognate_counter
                cognate_counter += 1
                D[k][-1] = cogid

    return D