This repository was archived by the owner on May 4, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.py
162 lines (123 loc) · 4.17 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import logging
from typing import Any, Optional

from bleach import clean
from lxml.html import document_fromstring
class Parser:
    """Validate and normalise individual raw field values.

    Every public ``parse_*`` method is static, takes a single raw value and
    returns its cleaned form.  Validation failures detected by these methods
    are raised as ``ValueError``.
    """

    # Maps the category ID found in the raw data to the category name that is
    # returned to the caller.
    # NOTE(review): 'govermentCharity' looks misspelled, but it is a runtime
    # value; left untouched because consumers may depend on it.
    CATEGORIES = {
        '1': 'charityDropOff',
        '2': 'accommodation',
        '3': 'govermentCharity',
        '4': 'psychologicalAssistance',
        '5': 'legalAssistance',
        '6': 'medicalAssistance',
        '7': 'animalAssistance',
        '8': 'childcare',
        '9': 'transport',
    }

    # Spellings accepted by parse_verified, compared after removing all
    # whitespace and lower-casing.  'tak' / 'nie' are Polish for yes / no; the
    # remaining entries are grammatical forms of "verified" / "not verified".
    _VERIFIED_TRUE = frozenset({
        'tak',
        'zweryfikowany',
        'zweryfikowane',
        'zweryfikowana',
        'zweryfikowano',
    })
    _VERIFIED_FALSE = frozenset({
        'nie',
        'niezweryfikowany',
        'niezweryfikowana',
        'niezweryfikowane',
        'niezweryfikowano',
    })

    # Placeholder texts (Polish "none" / "no") that mean "no value given" for
    # the phone and address fields.
    _NO_VALUE_MARKERS = frozenset({'brak', 'nie'})

    @staticmethod
    def parse_id(value: Any) -> str:
        """Return *value* converted to ``str``.

        Raises:
            ValueError: If *value* is falsy (``None``, ``''``, ``0``, ...).
        """
        if not value:
            raise ValueError('ID cannot be empty!')
        return str(value)

    @staticmethod
    def parse_category(value: Any) -> str:
        """Translate a list holding at most one category ID into its name.

        An empty list is currently treated the same as category ``'1'``.

        Raises:
            ValueError: If *value* is not a list, holds more than one entry,
                or holds an ID that is not a key of ``CATEGORIES``.
        """
        if not isinstance(value, list):
            raise ValueError(f'Unexpected category data type: {value}')
        if len(value) > 1:
            raise ValueError(f'Unexpected multiple categories: {value}')
        # Currently empty should be handled same as '1'
        if not value:
            return Parser.CATEGORIES['1']

        category_id = value[0]
        try:
            return Parser.CATEGORIES[category_id]
        except (KeyError, TypeError) as err:
            # KeyError: unknown ID; TypeError: unhashable ID (e.g. a nested
            # list).  Both are reported uniformly as ValueError.
            raise ValueError(f'Unexpected category ID: {category_id}') from err

    @staticmethod
    def parse_verified(value: Any) -> bool:
        """Interpret a textual "verified" flag as a boolean.

        Falsy input (``None``, ``''``, ``False``) yields ``False`` and a
        literal ``True`` yields ``True``.  Strings are compared
        case-insensitively with all whitespace removed.

        Raises:
            ValueError: If *value* is neither a bool nor a recognised string.
        """
        # For now an empty value simply means "not verified".
        if not value:
            return False
        if isinstance(value, bool):
            return value
        if not isinstance(value, str):
            # Previously this surfaced as AttributeError from ``.strip()``.
            raise ValueError(f'Unexpected verified value: {value}')

        # str.split() without arguments also drops leading/trailing
        # whitespace, so joining the pieces removes every whitespace char.
        normalised = ''.join(value.split()).lower()
        if normalised in Parser._VERIFIED_TRUE:
            return True
        if normalised in Parser._VERIFIED_FALSE:
            return False
        raise ValueError(f'Unexpected verified value: {value}')

    @staticmethod
    def _parse_coordinate(value: Any, label: str, low: float,
                          high: float) -> float:
        """Convert *value* to float and require ``low < result < high``."""
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError) as err:
            # Report every conversion failure as ValueError, like the other
            # validators in this class.
            raise ValueError(f'Invalid {label}: {value!r}') from err
        # NaN fails both comparisons and is therefore rejected as well.
        if not (low < number < high):
            raise ValueError(f'Suspicious {label}: {number}')
        return number

    @staticmethod
    def parse_lat(value: Any) -> float:
        """Return *value* as a latitude strictly between 45 and 56.

        NOTE(review): the bounds presumably describe the geographic area the
        data is expected to cover -- confirm.

        Raises:
            ValueError: If *value* is not numeric or lies outside the range.
        """
        return Parser._parse_coordinate(value, 'latitude', 45, 56)

    @staticmethod
    def parse_lng(value: Any) -> float:
        """Return *value* as a longitude strictly between 12 and 30.

        NOTE(review): the bounds presumably describe the geographic area the
        data is expected to cover -- confirm.

        Raises:
            ValueError: If *value* is not numeric or lies outside the range.
        """
        return Parser._parse_coordinate(value, 'longitude', 12, 30)

    @staticmethod
    def parse_name(value: Any) -> str:
        """Return the sanitised name.

        Raises:
            ValueError: If *value* is falsy.
        """
        if not value:
            raise ValueError('Name cannot be empty!')
        # strip=True makes bleach remove disallowed markup instead of
        # escaping it.
        return clean(value, strip=True)

    @staticmethod
    def parse_description(value: Any) -> Optional[str]:
        """Flatten an HTML description into sanitised text.

        ``div``/``br`` elements are followed by a newline and list items are
        rendered as ``- item`` lines.  If the HTML cannot be processed the
        error is logged and the raw value is sanitised instead (best effort).

        Returns:
            The cleaned text, or ``None`` when *value* is falsy.
        """
        if not value:
            return None

        try:
            doc = document_fromstring(value)

            # Add new lines to end of div/br elements
            for elem in doc.xpath('//*[self::div or self::br]'):
                elem.tail = '\n' + (elem.tail or '')

            # Add new line with '-' char to list elements
            for li in doc.xpath('*//li'):
                if li.text or len(li):
                    # Also covers items that start with a child element
                    # (li.text is None there), which used to lose the bullet.
                    li.text = '\n- ' + (li.text or '')
                else:
                    # Completely empty item: only a line break, no bullet.
                    li.text = '\n'

            value = doc.text_content()
            value = value.replace(', -', ',\n-')  # fix some lists
        except Exception as e:
            # Deliberate best effort: fall through and sanitise whatever
            # ``value`` currently holds.
            logging.error('Parsing description error: %s', e)

        return clean(value, strip=True)

    @staticmethod
    def _parse_optional_text(value: Any) -> Optional[str]:
        """Sanitise an optional text field; ``None`` for empty/placeholder."""
        if not value:
            return None
        text = str(value)
        if text.strip().lower() in Parser._NO_VALUE_MARKERS:
            return None
        # Clean the string form so non-text input (e.g. a phone number given
        # as int) is accepted, consistent with the placeholder check above.
        return clean(text, strip=True)

    @staticmethod
    def parse_phone(value: Any) -> Optional[str]:
        """Return the sanitised phone text.

        Returns ``None`` when *value* is falsy or equals (ignoring case and
        surrounding whitespace) one of the placeholders ``'brak'`` / ``'nie'``.
        """
        return Parser._parse_optional_text(value)

    @staticmethod
    def parse_addr(value: Any) -> Optional[str]:
        """Return the sanitised address text.

        Returns ``None`` when *value* is falsy or equals (ignoring case and
        surrounding whitespace) one of the placeholders ``'brak'`` / ``'nie'``.
        """
        return Parser._parse_optional_text(value)

    @staticmethod
    def parse_website(value: Any) -> Optional[str]:
        """Return *value* as ``str``, or ``None`` when it is falsy."""
        # Currently not used
        return str(value) if value else None

    @staticmethod
    def parse_opening_hours(value: Any) -> Optional[str]:
        """Return the sanitised opening-hours text, or ``None`` when falsy."""
        # Currently not used
        if not value:
            return None
        return clean(value, strip=True)