- # coding=utf-8
- import json
- import io
- import re
- ## Black -> White -> ❶ -> (1) (see hon)
- # words to test: are, suru, naru
def process_kanji(dfn):
    """Split a raw definition on ``■一■``-style section markers.

    The text is split wherever a newline is immediately followed by a kanji
    numeral (一 to 九) enclosed in black squares; the markers themselves are
    dropped.  When the split yields more than two pieces, the first piece
    (the text before the first marker) is treated as a shared header and is
    prepended, followed by a newline, to each remaining piece.  With two
    pieces or fewer the pieces are returned unchanged.

    Args:
        dfn: Raw definition string.

    Returns:
        A list of strings, one per resulting definition.
    """
    # Flags must be passed by keyword: the third positional parameter of
    # re.split is maxsplit, so a positional re.UNICODE (== 32) would silently
    # cap the number of splits at 32 instead of setting a flag.
    parts = re.split(r'\n■[一二三四五六七八九]■', dfn, flags=re.UNICODE)
    if len(parts) > 2:
        common = parts.pop(0) + "\n"
        # Build a real list so both branches return the same type
        # (map() is a lazy iterator on Python 3).
        parts = [common + part for part in parts]
    return parts
def process_white(dfn_lst):
    """Split every definition on ``□一□``-style section markers.

    Each string is split wherever a newline is immediately followed by a
    kanji numeral (一 to 九) enclosed in white squares; the markers are
    dropped.  When a string yields more than two pieces, the first piece is
    treated as a shared header and is prepended, followed by a newline, to
    each remaining piece.  Otherwise the pieces are kept unchanged.

    Args:
        dfn_lst: Iterable of definition strings.

    Returns:
        A flat list with the pieces of every input string, in order.
    """
    final_lst = []
    for s in dfn_lst:
        # Flags go by keyword; the third positional parameter of re.split is
        # maxsplit, so a positional re.UNICODE would cap the splits at 32.
        parts = re.split(r'\n□[一二三四五六七八九]□', s, flags=re.UNICODE)
        if len(parts) > 2:
            common = parts.pop(0) + "\n"
            parts = [common + part for part in parts]
        final_lst.extend(parts)
    return final_lst
def process_black_ball(dfn_lst):
    """Split every definition on ``❶``-style (negative circled number) markers.

    Each string is split wherever a newline is immediately followed by one of
    the characters ❶ to ⓴; the markers are dropped.  When a string yields
    more than two pieces, the first piece is treated as a shared header and
    is prepended, followed by a newline, to each remaining piece.  Otherwise
    the pieces are kept unchanged.

    Args:
        dfn_lst: Iterable of definition strings.

    Returns:
        A flat list with the pieces of every input string, in order.
    """
    final_lst = []
    for s in dfn_lst:
        # Flags go by keyword; the third positional parameter of re.split is
        # maxsplit, so a positional re.UNICODE would cap the splits at 32.
        parts = re.split(
            r'\n[❶❷❸❹❺❻❼❽❾❿⓫⓬⓭⓮⓯⓰⓱⓲⓳⓴]', s, flags=re.UNICODE
        )
        if len(parts) > 2:
            common = parts.pop(0) + "\n"
            parts = [common + part for part in parts]
        final_lst.extend(parts)
    return final_lst
def process_numbers(dfn_lst):
    """Split every definition on lines that start with ASCII digits.

    Each string is split wherever a newline is immediately followed by one or
    more ASCII digits.  Because the pattern contains a capturing group,
    re.split also returns the matched digits as separate items between the
    text pieces.  When a string yields more than two items, the first item is
    treated as a shared header and is prepended, followed by a newline, to
    each remaining item (digit items included).  Otherwise the items are kept
    unchanged.

    Args:
        dfn_lst: Iterable of definition strings.

    Returns:
        A flat list with the items of every input string, in order.
    """
    # NOTE(review): the parentheses form a capturing group, so digit-only
    # items end up in the output.  The surrounding comments mention "(1)";
    # possibly literal parentheses were intended -- confirm against the data
    # before changing the pattern.
    final_lst = []
    for s in dfn_lst:
        # Flags go by keyword; the third positional parameter of re.split is
        # maxsplit, so a positional re.UNICODE would cap the splits at 32.
        parts = re.split(r'\n([1234567890]+)', s, flags=re.UNICODE)
        if len(parts) > 2:
            common = parts.pop(0) + "\n"
            parts = [common + part for part in parts]
        final_lst.extend(parts)
    return final_lst
def process_dfn(dfn):
    """Break one raw definition string into a list of definition strings.

    The string is first split by process_kanji ("■一■" markers); the
    resulting pieces are then passed, in order, through process_white,
    process_black_ball and process_numbers, each of which may split them
    further.

    Args:
        dfn: Raw definition string.

    Returns:
        The list produced by the last stage, process_numbers.
    """
    pieces = process_kanji(dfn)
    # Each later stage takes the previous stage's pieces and returns a list.
    for splitter in (process_white, process_black_ball, process_numbers):
        pieces = splitter(pieces)
    return pieces
# Rewrite term_bank_1.json .. term_bank_33.json in place so that every entry
# carries exactly one definition: an entry whose definition list expands to
# N strings via process_dfn is replaced by N copies that differ only in item 5.
names = ['term_bank_' + str(number) + '.json' for number in range(1, 34)]

for file_name in names:
    print(file_name)
    glob = []
    # Read with an explicit encoding: the file is written back as UTF-8
    # below, so reading it with the locale default could mis-decode it.
    with io.open(file_name, encoding='utf-8') as json_file:
        data = json.load(json_file)
    for elem in data:
        # Item 5 of an entry is the list of raw definition strings
        # (presumably the Yomichan term-bank layout -- confirm).
        lst_dfn = []
        for raw_dfn in elem[5]:
            lst_dfn.extend(process_dfn(raw_dfn))
        for dfn in lst_dfn:
            # Copy items 0-4 and 6-7 unchanged; item 5 becomes a
            # single-definition list.
            glob.append([
                elem[0],
                elem[1],
                elem[2],
                elem[3],
                elem[4],
                [dfn],
                elem[6],
                elem[7],
            ])
    # The input file is overwritten with the expanded entries.
    with io.open(file_name, 'w', encoding='utf-8') as f:
        json.dump(glob, f, ensure_ascii=False)