import json
import os


class JIS_automaton:
    """Decode a stream of hex byte strings using a JSON charset table.

    The decoder keeps a current "mode" that selects which character set the
    following bytes belong to. Modes known to this class:

    * ``0`` - ISO/IEC 646 IRV, one byte per character
      (designated by the escape sequence ``1B 28 42``)
    * ``3`` - JIS X 0213:2004 plane 1, two bytes per character
      (designated by ``1B 24 28 51``)
    * ``4`` - JIS X 0213:2000 plane 2, two bytes per character
      (designated by ``1B 24 28 50``)

    The charset table is looked up with keys of the form ``"<mode>/<hex>"``
    and each entry must provide a ``"Unicode"`` value such as ``"U+3042"``.
    """

    # (escape sequence, mode selected by it); checked in this order.
    _ESCAPE_SEQUENCES = (
        (("1B", "28", "42"), 0),
        (("1B", "24", "28", "51"), 3),
        (("1B", "24", "28", "50"), 4),
    )

    def __init__(self, dict_path, init_mode=3):
        """Load the charset table and set the initial mode.

        Args:
            dict_path: Path to a UTF-8 encoded JSON file holding the table.
            init_mode: Mode in effect before any escape sequence is seen.
        """
        self.mode = init_mode  # initial mode
        # Every mode other than 0 uses two bytes per character.
        self.two_byte_mode = init_mode > 0
        with open(dict_path, mode="r", encoding="utf-8") as fr:
            self.charset_dict = json.load(fr)

    def change_mode(self, hex_list, pointer):
        """Switch mode if an escape sequence starts at ``pointer``.

        Args:
            hex_list: Sequence of two-character upper-case hex strings.
            pointer: Index in ``hex_list`` to inspect.

        Returns:
            ``(change_flag, plus_point)``: whether the mode was switched and
            how many items of ``hex_list`` the escape sequence occupies
            (``0`` when nothing matched).
        """
        for sequence, mode in self._ESCAPE_SEQUENCES:
            # A slice that runs past the end is simply shorter and cannot
            # compare equal, so no explicit length check is needed.
            window = tuple(hex_list[pointer:pointer + len(sequence)])
            if window == sequence:
                self.mode = mode
                # Keep the byte width in step with the newly selected mode;
                # otherwise bytes after the switch are grouped incorrectly.
                self.two_byte_mode = mode > 0
                return (True, len(sequence))
        return (False, 0)

    def convert_unicode_to_char(self, unicode_one):
        """Convert a ``"U+XXXX"`` notation into the corresponding text.

        Values made of several code points joined with ``+`` (for example
        ``"U+304B+309A"``) are converted to the concatenated characters.

        Raises:
            ValueError: If a code point is not valid hexadecimal.
        """
        code_points = unicode_one.lstrip("U+").split("+")
        return "".join(chr(int(code_point, 16)) for code_point in code_points)

    def scan(self, hex_list):
        """Decode ``hex_list`` into a list of characters.

        The mode left by the last escape sequence persists on the instance
        after the call returns.

        Args:
            hex_list: Sequence of two-character hex strings, one per byte.

        Returns:
            List with one decoded string per character code.

        Raises:
            KeyError: If a code is missing from the charset table.
            IndexError: If the input ends in the middle of a two-byte code.
        """
        pointer = 0
        return_list = []
        while pointer < len(hex_list):
            change_flag, plus_point = self.change_mode(hex_list, pointer)
            if change_flag:
                # Re-check from the new position: the escape sequence may be
                # the last thing in the input or be followed by another one.
                pointer += plus_point
                continue
            if self.two_byte_mode:
                key = "{0}/{1}{2}".format(
                    self.mode, hex_list[pointer], hex_list[pointer + 1]
                )
                pointer += 2
            else:
                key = "{0}/{1}".format(self.mode, hex_list[pointer])
                pointer += 1
            return_list.append(
                self.convert_unicode_to_char(self.charset_dict[key]["Unicode"])
            )
        return return_list

    def scans(self, hex_text):
        """Decode a contiguous hex string (two hex digits per byte)."""
        hex_list = [hex_text[i:i + 2] for i in range(0, len(hex_text), 2)]
        return "".join(self.scan(hex_list))


if "__main__" == __name__:
    # Resolve the table file relative to this script's own directory.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    # Table source: http://charset.7jp.net/jis0208.html
    automaton = JIS_automaton("jis0208.json", init_mode=3)
    # Samples of the sentence "私はパンダである。笹には栄養が無い。"
    # ("I am a panda. Bamboo grass has no nutrition.").
    # Only the last assignment is actually decoded below.
    # text.encode("iso2022_jp_2004").hex().upper()
    sample = "1B24423B64244F25512573254024472422246B21233A7B244B244F31494D5C242C4C35242421231B2842"
    # text.encode("euc_jis_2004").hex().upper()
    sample = "BBE4A4CFA5D1A5F3A5C0A4C7A4A2A4EBA1A3BAFBA4CBA4CFB1C9CDDCA4ACCCB5A4A4A1A3"
    # text.encode("shift-jis").hex().upper()
    sample = "8E8482CD83708393835F82C582A082E981428DF982C982CD8968977B82AA96B382A28142"
    # The same sentence in JIS X 0208 (per the original author's note).
    sample = "9B4F82F18A538A5F8A9A82F382A082B181A08D6282F982F18B958C5182C98C5582A481A0"
    print(len(sample))
    decoded = automaton.scans(sample)
    print("{0}: {1}".format(sample, decoded))