Facebook
From Silly Treeshrew, 3 Years ago, written in Plain Text.
Embed
Download Paste or View Raw
Hits: 117
  1. # coding=utf-8
  2. import json
  3. import io
  4. import re
  5.  
  6.  
  7. ## Black -> White -> ❶ -> (1) (see hon)
  8.  
  9. # words to test: are, suru, naru
  10.  
  11. def process_kanji (dfn):
  12.     # gets string and returns list of ■一■-level dfns processed
  13.     #re_obj = re.split(ur"■[一二三四五六七八九十]■", dfn)
  14.     re_obj = re.split(r'\n■[一二三四五六七八九]■', dfn, re.UNICODE)
  15.     if(len(re_obj) > 2):
  16.         common = re_obj.pop(0) + "\n"
  17.         re_obj = map (lambda x: common + x, re_obj)
  18.  
  19.     return re_obj
  20.  
  21. def process_white (dfn_lst):
  22.     # gets lst of dfns and returns (possibly bigger) final lst of dfns
  23.     final_lst = []
  24.     for s in dfn_lst:
  25.         re_obj = re.split(r'\n□[一二三四五六七八九]□', s, re.UNICODE)
  26.         if (len(re_obj) > 2):
  27.             common = re_obj.pop(0) + "\n"
  28.             re_obj = map( lambda x: common + x, re_obj)
  29.         final_lst.extend(re_obj)
  30.     return final_lst
  31.  
  32. def process_black_ball (dfn_lst):
  33.     # gets lst of dfns and returns (possibly bigger) final lst of dfns
  34.     final_lst = []
  35.     for s in dfn_lst:
  36.         re_obj = re.split(r'\n[❶❷❸❹❺❻❼❽❾❿⓫⓬⓭⓮⓯⓰⓱⓲⓳⓴]', s, re.UNICODE)
  37.         if (len(re_obj) > 2):
  38.             common = re_obj.pop(0) + "\n"
  39.             re_obj = map( lambda x: common + x, re_obj)
  40.         final_lst.extend(re_obj)
  41.     return final_lst
  42.  
  43. def process_numbers (dfn_lst):
  44.     # gets lst of dfns and returns (possibly bigger) final lst of dfns
  45.     final_lst = []
  46.     for s in dfn_lst:
  47.         re_obj = re.split(r'\n([1234567890]+)', s, re.UNICODE)
  48.         if (len(re_obj) > 2):
  49.             common = re_obj.pop(0) + "\n"
  50.             re_obj = map( lambda x: common + x, re_obj)
  51.         final_lst.extend(re_obj)
  52.     return final_lst
  53.  
  54.  
  55.  
  56. def process_dfn (dfn):
  57.     # receives raw string and returns LIST of strings (one for each definition)
  58.     #dfn = dfn.replace('{(1)}','-- (1) --')
  59.     #dfn = dfn.replace('〔(1)','〔 (1) ')
  60.  
  61.     # process "■一■" and similar
  62.     dfn = process_kanji (dfn) # returns list of strings
  63.  
  64.     dfn = process_white (dfn) # returns list of strings
  65.  
  66.     dfn = process_black_ball(dfn) # returns list of strings
  67.    
  68.     # process "(1)" and similar
  69.     lst = process_numbers (dfn)
  70.  
  71.     return lst
  72.  
  73.  
  74. number = 1
  75.  
  76. names = []
  77.  
  78. for number in range(1,34):
  79.         name = 'term_bank_' + str(number) + '.json'
  80.         names.append(name)
  81.  
  82.  
  83. for file_name in names:
  84.     print(file_name)
  85.  
  86.     glob = []
  87.  
  88.     with open(file_name) as json_file:
  89.         data = json.load(json_file)
  90.  
  91.     for elem in data:
  92.         lst = elem[5]
  93.  
  94.         lst_dfn = []
  95.  
  96.         for raw_dfn in lst:
  97.             lst_dfn.extend(process_dfn(raw_dfn))
  98.  
  99.         for dfn in lst_dfn:
  100.             current_elem = []
  101.  
  102.             current_elem.append( elem[0] )
  103.             current_elem.append( elem[1] )
  104.             current_elem.append( elem[2] )
  105.             current_elem.append( elem[3] )
  106.             current_elem.append( elem[4] )
  107.             current_elem.append( [dfn] )
  108.             current_elem.append( elem[6] )
  109.             current_elem.append( elem[7] )
  110.  
  111.             glob.append( current_elem )
  112.  
  113.  
  114.     with io.open(str(file_name), 'w', encoding='utf-8') as f:
  115.         json.dump(glob, f, ensure_ascii=False)
  116.