-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataset_trans.py
55 lines (50 loc) · 1.09 KB
/
dataset_trans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import json
# Script: convert a BIO-tagged corpus ("<token> <label>" per line, sentences
# separated by a line without a space) into a JSON list of records shaped as
#     {"sen": <concatenated tokens>, "tag": [{"type", "start", "end"}, ...]}
# where `start` is the offset of the entity's first character in "sen" and
# `end` is the offset just past its last character (exclusive).
pre_path = './msra_test_bio'
nex_path = 'anno.json'


def _close_span(spans, entity_type, start, end):
    """Append the span [start, end) to *spans* if an entity is open.

    An empty *entity_type* means no entity is currently open, in which case
    nothing is appended.
    """
    if entity_type:
        spans.append({
            'type': entity_type,
            'start': start,
            'end': end,
        })


def _bio_to_records(lines):
    """Turn an iterable of BIO-tagged lines into a list of sentence records.

    Each line is split on single spaces:

    * exactly two fields  -> ``<token> <label>``; the token is appended to
      the current sentence and the label (``O``, ``B-<type>``, ``I-<type>``)
      drives entity tracking;
    * exactly one field   -> sentence separator; the current record is
      emitted (even if empty) and a fresh one is started;
    * any other count     -> the line is ignored.

    An entity opens on ``B`` and closes on the next ``O`` or ``B``, at a
    sentence separator, or at the end of input. ``I`` labels (and any
    unknown label prefix) neither open nor close an entity.

    Returns a list of ``{'sen': str, 'tag': list[dict]}`` records.
    """
    records = []
    sentence = ''
    spans = []
    open_type = ''   # type of the entity currently being read; '' if none
    open_start = 0   # offset in `sentence` where that entity begins

    for raw in lines:
        fields = raw.rstrip('\n').split(' ')

        if len(fields) == 1:
            # Sentence boundary: an entity that runs up to the last token
            # must be closed here, otherwise it would leak into the next
            # sentence with a stale start offset.
            _close_span(spans, open_type, open_start, len(sentence))
            open_type = ''
            records.append({'sen': sentence, 'tag': spans})
            sentence = ''
            spans = []
        elif len(fields) == 2:
            token, label = fields
            # Offset of this token's first character. Taken *before* the
            # token is appended so it is also right for multi-char tokens.
            offset = len(sentence)
            sentence += token

            label_parts = label.split('-')
            marker = label_parts[0]
            if marker in ('O', 'B'):
                # Both O and B terminate whatever entity was open; the
                # current token is not part of it, so `offset` is the
                # exclusive end.
                _close_span(spans, open_type, open_start, offset)
                open_type = ''
            if marker == 'B':
                open_type = label_parts[1]
                open_start = offset

    # End of input: keep a final sentence that was not followed by a
    # separator line. An empty buffer means the input already ended on a
    # separator, so there is nothing left to emit.
    _close_span(spans, open_type, open_start, len(sentence))
    if sentence or spans:
        records.append({'sen': sentence, 'tag': spans})

    return records


data = []
with open(pre_path, 'r', encoding='utf-8') as f:
    # Iterate the file object directly instead of materialising every line.
    data = _bio_to_records(f)

with open(nex_path, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    json.dump(data, f, indent=4, ensure_ascii=False)