# coding=utf-8
"""Split multi-sense definition strings in term_bank_*.json files.

Each definition string is split, level by level, on sense markers that
start a new line.  Order of the levels:

    Black (■一■) -> White (□一□) -> ❶ -> numbers

Words to test: are, suru, naru (see also hon).
"""
import json
import io
import re

# Markers are only recognised directly after a newline.  The patterns are
# compiled once here instead of on every call.
_KANJI_RE = re.compile(r'\n■[一二三四五六七八九]■', re.UNICODE)
_WHITE_RE = re.compile(r'\n□[一二三四五六七八九]□', re.UNICODE)
_BLACK_BALL_RE = re.compile(r'\n[❶❷❸❹❺❻❼❽❾❿⓫⓬⓭⓮⓯⓰⓱⓲⓳⓴]', re.UNICODE)
# NOTE(review): the parentheses below form a capturing group, so re.split
# also returns the matched digits as separate pieces.  If the intent was to
# match a literal "(1)", the parentheses need escaping -- confirm against
# the data before changing; the original matching behaviour is kept here.
_NUMBERS_RE = re.compile(r'\n([1234567890]+)', re.UNICODE)


def _split_with_common(text, pattern):
    """Split *text* on *pattern* and return the pieces as a list.

    When the split yields more than two pieces, the first piece is treated
    as a common header: it is removed from the list and prepended (followed
    by a newline) to every remaining piece.  With one or two pieces the
    split result is returned unchanged.
    """
    # The original code passed re.UNICODE as the third positional argument
    # of re.split, which is `maxsplit`, not `flags`; that capped the number
    # of splits at 32.  Using the compiled pattern splits without a limit.
    parts = pattern.split(text)
    if len(parts) > 2:
        common = parts.pop(0) + "\n"
        parts = [common + part for part in parts]
    return parts


def _split_each(dfn_lst, pattern):
    """Apply _split_with_common to every string and flatten the results."""
    final_lst = []
    for s in dfn_lst:
        final_lst.extend(_split_with_common(s, pattern))
    return final_lst


def process_kanji(dfn):
    """Split one raw definition string on the ■一■-level markers.

    Returns a list of strings (always a list, never a lazy map object).
    """
    return _split_with_common(dfn, _KANJI_RE)


def process_white(dfn_lst):
    """Split every string in *dfn_lst* on the □一□-level markers.

    Returns the (possibly longer) flattened list of strings.
    """
    return _split_each(dfn_lst, _WHITE_RE)


def process_black_ball(dfn_lst):
    """Split every string in *dfn_lst* on the ❶-style markers.

    Returns the (possibly longer) flattened list of strings.
    """
    return _split_each(dfn_lst, _BLACK_BALL_RE)


def process_numbers(dfn_lst):
    """Split every string in *dfn_lst* on a newline followed by digits.

    Returns the (possibly longer) flattened list of strings.  Because the
    pattern has a capturing group, the matched digits appear as pieces of
    their own (see the review note on _NUMBERS_RE).
    """
    return _split_each(dfn_lst, _NUMBERS_RE)


def process_dfn(dfn):
    """Turn one raw definition string into a list of definition strings.

    The levels are applied from the outermost marker to the innermost one.
    """
    lst = process_kanji(dfn)
    lst = process_white(lst)
    lst = process_black_ball(lst)
    return process_numbers(lst)


def _expand_entry(elem):
    """Return one copy of *elem* per split definition.

    elem[5] is iterated as a list of raw definition strings; each output
    entry carries exactly one definition, wrapped in a list, at index 5.
    Fields 0-4 and 6-7 are copied unchanged.
    """
    lst_dfn = []
    for raw_dfn in elem[5]:
        lst_dfn.extend(process_dfn(raw_dfn))
    return [
        [elem[0], elem[1], elem[2], elem[3], elem[4],
         [dfn], elem[6], elem[7]]
        for dfn in lst_dfn
    ]


def main():
    """Rewrite term_bank_1.json .. term_bank_33.json in place."""
    names = ['term_bank_' + str(number) + '.json' for number in range(1, 34)]
    for file_name in names:
        print(file_name)
        # Read as UTF-8 explicitly: the output below is written as UTF-8,
        # and the platform default encoding is not guaranteed to match.
        with io.open(file_name, encoding='utf-8') as json_file:
            data = json.load(json_file)

        entries = []
        for elem in data:
            entries.extend(_expand_entry(elem))

        # The input has been fully loaded and closed, so it is safe to
        # overwrite the same file.
        with io.open(str(file_name), 'w', encoding='utf-8') as f:
            json.dump(entries, f, ensure_ascii=False)


if __name__ == "__main__":
    main()