# Parse CaboCha lattice output into Chunk/Morph objects (problem 42 input pass).
# text[:-1] drops the trailing empty string produced by split('\n').
# Chunk and Morph are project classes defined elsewhere in this file.
for line in text[:-1]:
    if line == 'EOS':
        # End of sentence: store its morphemes and reset per-sentence state.
        result.append(morphs)
        morphs = []
        sentence_id += 1
        chunk_id = 0
        srcs = [[]]
    elif line[0] == '*':
        # Chunk header line, format '* <id> <dst>D ...'.
        if chunk:
            chunks.append(chunk)
        # Dependency target: strip the trailing 'D' and convert to int.
        dst = int(line.split()[2][:-1])
        # Grow srcs so that index dst exists, then record this chunk as a source.
        diff = dst + 1 - len(srcs)
        ex = [[] for _ in range(diff)]
        srcs.extend(ex)
        if dst != -1:
            srcs[dst].append(chunk_id)
        chunk = Chunk(sentence_id, chunk_id, dst, srcs)
        chunk_id += 1
    else:
        # Morpheme line: 'surface\tpos,pos1,...,base,...'.
        ls = line.split('\t')
        d = {}  # NOTE(review): unused
        tmp = ls[1].split(',')
        # Presumably Morph(surface, base, pos, pos1) — tmp[6] is the MeCab
        # base form; confirm against the Morph class definition.
        morph = Morph(ls[0], tmp[6], tmp[0], tmp[1])
        morphs.append(morph)
        chunk.morphs.append(morph)
else:
    # for-else: runs once after the loop ends; append the final pending chunk.
    chunks.append(chunk)
# Problem 42: print each chunk's surface text paired with the surface text of
# the chunk it depends on, skipping punctuation/space and empty/self pairs.
sentences = [[] for _ in range(len(chunks))]
for chunk in chunks:
    # Join the chunk's morpheme surfaces in one pass instead of quadratic '+='.
    sentences[chunk.sentence_id].append(
        ''.join(morph.surface for morph in chunk.morphs))

dsts = [[] for _ in range(len(chunks))]
for chunk in chunks:
    # NOTE(review): chunk.dst == -1 (root) indexes the sentence's last chunk;
    # those pairs are only filtered below when the cleaned strings coincide.
    dsts[chunk.sentence_id].append(sentences[chunk.sentence_id][chunk.dst])

sentences = list(itertools.chain.from_iterable(sentences))
dsts = list(itertools.chain.from_iterable(dsts))
for s, d in zip(sentences, dsts):  # enumerate index was unused — dropped
    s = s.replace(' ', '').replace('。', '').replace('、', '')
    d = d.replace(' ', '').replace('。', '').replace('、', '')
    if s == d or s == '' or d == '':
        continue
    print(s, d)
# Parse CaboCha lattice output into Chunk/Morph objects (problem 43 input pass).
# text[:-1] drops the trailing empty string produced by split('\n').
# Chunk and Morph are project classes defined elsewhere in this file.
for line in text[:-1]:
    if line == 'EOS':
        # End of sentence: store its morphemes and reset per-sentence state.
        result.append(morphs)
        morphs = []
        sentence_id += 1
        chunk_id = 0
        srcs = [[]]
    elif line[0] == '*':
        # Chunk header line, format '* <id> <dst>D ...'.
        if chunk:
            chunks.append(chunk)
        # Dependency target: strip the trailing 'D' and convert to int.
        dst = int(line.split()[2][:-1])
        # Grow srcs so that index dst exists, then record this chunk as a source.
        diff = dst + 1 - len(srcs)
        ex = [[] for _ in range(diff)]
        srcs.extend(ex)
        if dst != -1:
            srcs[dst].append(chunk_id)
        chunk = Chunk(sentence_id, chunk_id, dst, srcs)
        chunk_id += 1
    else:
        # Morpheme line: 'surface\tpos,pos1,...,base,...'.
        ls = line.split('\t')
        d = {}  # NOTE(review): unused
        tmp = ls[1].split(',')
        # Presumably Morph(surface, base, pos, pos1) — tmp[6] is the MeCab
        # base form; confirm against the Morph class definition.
        morph = Morph(ls[0], tmp[6], tmp[0], tmp[1])
        morphs.append(morph)
        chunk.morphs.append(morph)
else:
    # for-else: runs once after the loop ends; append the final pending chunk.
    chunks.append(chunk)
# Problem 43 preparation: give every chunk its concatenated surface string and
# noun/verb flags, and group the chunks by sentence.
sentences = [[] for _ in range(len(chunks))]
for chunk in chunks:
    chunk.surfaces += ''.join(m.surface for m in chunk.morphs)
    pos_tags = [m.pos for m in chunk.morphs]
    # A morpheme has exactly one POS, so independent ifs match the original
    # if/elif behavior.
    if '動詞' in pos_tags:
        chunk.has_verb = True
    if '名詞' in pos_tags:
        chunk.has_noun = True
    sentences[chunk.sentence_id].append(chunk)
# Problem 43: print noun-chunk -> verb-chunk dependency pairs.
dsts = [[] for _ in range(len(chunks))]
for chunk in chunks:
    dsts[chunk.sentence_id].append(sentences[chunk.sentence_id][chunk.dst])
sentences = list(itertools.chain.from_iterable(sentences))
dsts = list(itertools.chain.from_iterable(dsts))
for sentence, dst in zip(sentences, dsts):  # enumerate index was unused — dropped
    if sentence.has_noun and dst.has_verb:
        s = sentence.surfaces
        d = dst.surfaces
        s = s.replace(' ', '').replace('。', '').replace('、', '')
        d = d.replace(' ', '').replace('。', '').replace('、', '')
        if s == d or s == '' or d == '':
            continue
        print(s, d)
# Parse CaboCha lattice output into Chunk/Morph objects (problem 45 input pass).
# text[:-1] drops the trailing empty string produced by split('\n').
# Chunk and Morph are project classes defined elsewhere in this file.
for line in text[:-1]:
    if line == 'EOS':
        # End of sentence: store its morphemes and reset per-sentence state.
        result.append(morphs)
        morphs = []
        sentence_id += 1
        chunk_id = 0
        srcs = [[]]
    elif line[0] == '*':
        # Chunk header line, format '* <id> <dst>D ...'.
        if chunk:
            chunks.append(chunk)
        # Dependency target: strip the trailing 'D' and convert to int.
        dst = int(line.split()[2][:-1])
        # Grow srcs so that index dst exists, then record this chunk as a source.
        diff = dst + 1 - len(srcs)
        ex = [[] for _ in range(diff)]
        srcs.extend(ex)
        if dst != -1:
            srcs[dst].append(chunk_id)
        chunk = Chunk(sentence_id, chunk_id, dst, srcs)
        chunk_id += 1
    else:
        # Morpheme line: 'surface\tpos,pos1,...,base,...'.
        ls = line.split('\t')
        d = {}  # NOTE(review): unused
        tmp = ls[1].split(',')
        # Presumably Morph(surface, base, pos, pos1) — tmp[6] is the MeCab
        # base form; confirm against the Morph class definition.
        morph = Morph(ls[0], tmp[6], tmp[0], tmp[1])
        morphs.append(morph)
        chunk.morphs.append(morph)
else:
    # for-else: runs once after the loop ends; append the final pending chunk.
    chunks.append(chunk)
# Problem 45 preparation: annotate each chunk with its surface string, the base
# form of its first verb, noun/particle flags, and its particle surfaces; then
# group chunks by sentence.
sentences = [[] for _ in range(len(chunks))]
for chunk in chunks:
    for morph in chunk.morphs:
        chunk.surfaces += morph.surface
        if morph.pos == '動詞':
            if not chunk.has_verb:  # idiomatic form of '== False'
                # Remember only the FIRST verb's base form.
                chunk.first_verb = morph.base
            chunk.has_verb = True
        elif morph.pos == '名詞':
            chunk.has_noun = True
        elif morph.pos == '助詞':
            chunk.has_particle = True
            chunk.particle.append(morph.surface)
    sentences[chunk.sentence_id].append(chunk)
# Problem 45: for each verb, collect the particles of all chunks that depend on
# it, and write 'verb<TAB>sorted particles' per sentence to 45.txt.
dsts = [[] for _ in range(len(chunks))]
for chunk in chunks:
    dsts[chunk.sentence_id].append(sentences[chunk.sentence_id][chunk.dst])
with open('45.txt', mode='w') as f:
    for sentence, dst in zip(sentences, dsts):  # enumerate index was unused
        dic = {}  # first_verb -> list of particle surfaces (this sentence only)
        for s, d in zip(sentence, dst):
            if s.particle and d.first_verb:
                # setdefault+extend is equivalent to the original
                # get-default-then-rebind, with a single lookup.
                dic.setdefault(d.first_verb, []).extend(s.particle)
        for k, v in dic.items():
            output = k + '\t' + ' '.join(sorted(v)) + '\n'
            f.write(output)
# Parse CaboCha lattice output into Chunk/Morph objects (problem 46 input pass).
# text[:-1] drops the trailing empty string produced by split('\n').
# Chunk and Morph are project classes defined elsewhere in this file.
for line in text[:-1]:
    if line == 'EOS':
        # End of sentence: store its morphemes and reset per-sentence state.
        result.append(morphs)
        morphs = []
        sentence_id += 1
        chunk_id = 0
        srcs = [[]]
    elif line[0] == '*':
        # Chunk header line, format '* <id> <dst>D ...'.
        if chunk:
            chunks.append(chunk)
        # Dependency target: strip the trailing 'D' and convert to int.
        dst = int(line.split()[2][:-1])
        # Grow srcs so that index dst exists, then record this chunk as a source.
        diff = dst + 1 - len(srcs)
        ex = [[] for _ in range(diff)]
        srcs.extend(ex)
        if dst != -1:
            srcs[dst].append(chunk_id)
        chunk = Chunk(sentence_id, chunk_id, dst, srcs)
        chunk_id += 1
    else:
        # Morpheme line: 'surface\tpos,pos1,...,base,...'.
        ls = line.split('\t')
        d = {}  # NOTE(review): unused
        tmp = ls[1].split(',')
        # Presumably Morph(surface, base, pos, pos1) — tmp[6] is the MeCab
        # base form; confirm against the Morph class definition.
        morph = Morph(ls[0], tmp[6], tmp[0], tmp[1])
        morphs.append(morph)
        chunk.morphs.append(morph)
else:
    # for-else: runs once after the loop ends; append the final pending chunk.
    chunks.append(chunk)
# Problem 46 preparation: annotate each chunk with its surface string, the base
# form of its first verb, noun/particle flags, and its particle surfaces; then
# group chunks by sentence.
sentences = [[] for _ in range(len(chunks))]
for chunk in chunks:
    for morph in chunk.morphs:
        chunk.surfaces += morph.surface
        if morph.pos == '動詞':
            if not chunk.has_verb:  # idiomatic form of '== False'
                # Remember only the FIRST verb's base form.
                chunk.first_verb = morph.base
            chunk.has_verb = True
        elif morph.pos == '名詞':
            chunk.has_noun = True
        elif morph.pos == '助詞':
            chunk.has_particle = True
            chunk.particle.append(morph.surface)
    sentences[chunk.sentence_id].append(chunk)
# Problem 46: like 45, but also record the surface of each dependent chunk;
# write 'verb<TAB>particles<TAB>argument surfaces' per sentence to 46.txt.
dsts = [[] for _ in range(len(chunks))]
for chunk in chunks:
    dsts[chunk.sentence_id].append(sentences[chunk.sentence_id][chunk.dst])
with open('46.txt', mode='w') as f:
    for sentence, dst in zip(sentences, dsts):  # enumerate index was unused
        dic = {}  # first_verb -> list of [particle, chunk surface] pairs
        for s, d in zip(sentence, dst):
            if s.particle and d.first_verb:
                surfaces = s.surfaces.replace(' ', '').replace('。', '').replace('、', '')
                # BUG FIX: the original rebound dic[key] to old + [[p, surfaces]]
                # inside the particle loop with a stale `old`, so when a chunk
                # carried several particles only the last survived; accumulate
                # every particle instead.
                dic.setdefault(d.first_verb, []).extend(
                    [p, surfaces] for p in s.particle)
        for k, v in dic.items():
            ls = sorted(v)
            ls = list(zip(*ls))  # transpose into (particles, surfaces)
            output = k + '\t' + ' '.join(ls[0]) + '\t' + ' '.join(ls[1]) + '\n'
            f.write(output)
# Problem 47 (as written, a duplicate of 46 that also echoes each record to
# stdout): write 'verb<TAB>particles<TAB>argument surfaces' to 47.txt.
dsts = [[] for _ in range(len(chunks))]
for chunk in chunks:
    dsts[chunk.sentence_id].append(sentences[chunk.sentence_id][chunk.dst])
with open('47.txt', mode='w') as f:
    for sentence, dst in zip(sentences, dsts):  # enumerate index was unused
        dic = {}  # first_verb -> list of [particle, chunk surface] pairs
        for s, d in zip(sentence, dst):
            if s.particle and d.first_verb:
                surfaces = s.surfaces.replace(' ', '').replace('。', '').replace('、', '')
                # BUG FIX: the original rebound dic[key] with a stale `old`
                # inside the particle loop, keeping only the last particle of a
                # multi-particle chunk; accumulate every particle instead.
                dic.setdefault(d.first_verb, []).extend(
                    [p, surfaces] for p in s.particle)
        for k, v in dic.items():
            ls = sorted(v)
            ls = list(zip(*ls))  # transpose into (particles, surfaces)
            output = k + '\t' + ' '.join(ls[0]) + '\t' + ' '.join(ls[1]) + '\n'
            print(output)
            f.write(output)
defshow_morphs(self): morphs = '' for morph in self.morphs: morphs += morph.surface print ("morphs:",morphs) defshow_chunk_id(self): print ("==========") print ("chunk_id:",self.chunk_id) defshow_sentence_id(self): if (self.chunk_id == 0): print ("====================") print ("sentence_id:",self.sentence_id) defshow_dst(self): print ("dst:",self.dst) defshow_srcs(self): print ("srcs:",self.srcs[self.chunk_id]) path = 'neko.txt.cabocha' with open(path) as f: text = f.read().split('\n') result = [] morphs = [] chunks = [] srcs = [[]] chunk = None sentence_id = 0 chunk_id = 0
# Parse CaboCha lattice output into Chunk/Morph objects (problem 48 input pass).
# text[:-1] drops the trailing empty string produced by split('\n').
# Chunk and Morph are project classes defined elsewhere in this file.
for line in text[:-1]:
    if line == 'EOS':
        # End of sentence: store its morphemes and reset per-sentence state.
        result.append(morphs)
        morphs = []
        sentence_id += 1
        chunk_id = 0
        srcs = [[]]
    elif line[0] == '*':
        # Chunk header line, format '* <id> <dst>D ...'.
        if chunk:
            chunks.append(chunk)
        # Dependency target: strip the trailing 'D' and convert to int.
        dst = int(line.split()[2][:-1])
        # Grow srcs so that index dst exists, then record this chunk as a source.
        diff = dst + 1 - len(srcs)
        ex = [[] for _ in range(diff)]
        srcs.extend(ex)
        if dst != -1:
            srcs[dst].append(chunk_id)
        chunk = Chunk(sentence_id, chunk_id, dst, srcs)
        chunk_id += 1
    else:
        # Morpheme line: 'surface\tpos,pos1,...,base,...'.
        ls = line.split('\t')
        d = {}  # NOTE(review): unused
        tmp = ls[1].split(',')
        # Presumably Morph(surface, base, pos, pos1) — tmp[6] is the MeCab
        # base form; confirm against the Morph class definition.
        morph = Morph(ls[0], tmp[6], tmp[0], tmp[1])
        morphs.append(morph)
        chunk.morphs.append(morph)
else:
    # for-else: runs once after the loop ends; append the final pending chunk.
    chunks.append(chunk)
# Problem 48 preparation: per chunk, record [surface text, dst, has_noun]
# grouped by sentence.
sentences = [[] for _ in range(len(chunks))]
for chunk in chunks:
    morphs = ''.join(m.surface for m in chunk.morphs)
    if any(m.pos == '名詞' for m in chunk.morphs):
        chunk.has_noun = True
    sentences[chunk.sentence_id].append([morphs, chunk.dst, chunk.has_noun])
def rec(sentence, d, ans):
    """Follow dependency links starting at chunk index d, appending
    ' -> ' + surface for every hop; return the accumulated path string.

    sentence: list of [surface, dst, ...] entries for one sentence.
    d: index of the next chunk on the path, or -1 at the root.
    ans: path accumulated so far (the starting chunk's surface).
    """
    if d == -1:
        return ans
    return rec(sentence, sentence[d][1], ans + ' -> ' + sentence[d][0])
# Problem 48: for every noun chunk, print/write its full dependency path to
# the root, cleaned of spaces and punctuation.
with open('48.txt', mode='w') as f:
    for sentence in sentences:  # enumerate index was unused — dropped
        for s, d, has_noun in sentence:
            if has_noun:
                ans = rec(sentence, d, s)
                ans = ans.replace(' ', '').replace('。', '').replace('、', '')
                print(ans)
                f.write(ans + '\n')
defshow_morphs(self): morphs = '' for morph in self.morphs: morphs += morph.surface print ("morphs:",morphs) defshow_chunk_id(self): print ("==========") print ("chunk_id:",self.chunk_id) defshow_sentence_id(self): if (self.chunk_id == 0): print ("====================") print ("sentence_id:",self.sentence_id) defshow_dst(self): print ("dst:",self.dst) defshow_srcs(self): print ("srcs:",self.srcs[self.chunk_id]) path = 'neko.txt.cabocha' with open(path) as f: text = f.read().split('\n') result = [] morphs = [] chunks = [] srcs = [[]] chunk = None sentence_id = 0 chunk_id = 0
# Parse CaboCha lattice output into Chunk/Morph objects (problem 49 input pass).
# text[:-1] drops the trailing empty string produced by split('\n').
# Chunk and Morph are project classes defined elsewhere in this file.
for line in text[:-1]:
    if line == 'EOS':
        # End of sentence: store its morphemes and reset per-sentence state.
        result.append(morphs)
        morphs = []
        sentence_id += 1
        chunk_id = 0
        srcs = [[]]
    elif line[0] == '*':
        # Chunk header line, format '* <id> <dst>D ...'.
        if chunk:
            chunks.append(chunk)
        # Dependency target: strip the trailing 'D' and convert to int.
        dst = int(line.split()[2][:-1])
        # Grow srcs so that index dst exists, then record this chunk as a source.
        diff = dst + 1 - len(srcs)
        ex = [[] for _ in range(diff)]
        srcs.extend(ex)
        if dst != -1:
            srcs[dst].append(chunk_id)
        chunk = Chunk(sentence_id, chunk_id, dst, srcs)
        chunk_id += 1
    else:
        # Morpheme line: 'surface\tpos,pos1,...,base,...'.
        ls = line.split('\t')
        d = {}  # NOTE(review): unused
        tmp = ls[1].split(',')
        # Presumably Morph(surface, base, pos, pos1) — tmp[6] is the MeCab
        # base form; confirm against the Morph class definition.
        morph = Morph(ls[0], tmp[6], tmp[0], tmp[1])
        morphs.append(morph)
        chunk.morphs.append(morph)
else:
    # for-else: runs once after the loop ends; append the final pending chunk.
    chunks.append(chunk)
# Problem 49 preparation: per chunk, record
# [surface text, dst, has_noun, first_noun] grouped by sentence.
sentences = [[] for _ in range(len(chunks))]
for chunk in chunks:
    morphs = ''
    for morph in chunk.morphs:
        if morph.pos == '名詞':
            # BUG FIX: the original overwrote first_noun on every noun, so the
            # attribute actually held the LAST noun; guard it so it keeps the
            # first, mirroring the first_verb handling elsewhere in this file.
            if not chunk.has_noun:
                chunk.first_noun = morph.surface
            chunk.has_noun = True
        morphs += morph.surface
    sentences[chunk.sentence_id].append(
        [morphs, chunk.dst, chunk.has_noun, chunk.first_noun])
def rec(sentence, d, ans):
    """Follow dependency links starting at chunk index d, appending
    ' -> ' + surface for every hop; return the accumulated path string.
    """
    if d == -1:
        return ans
    return rec(sentence, sentence[d][1], ans + ' -> ' + sentence[d][0])


def get_path(d, ls):
    """Append the chunk indices along the dependency chain starting at d
    (including the terminal -1) to ls and return it.

    NOTE(review): reads the global `sentence` bound by the loop below, and
    mutates the caller's list via `ls += [d]` — both kept intentionally.
    """
    ls += [d]
    if d == -1:
        return ls
    return get_path(sentence[d][1], ls)
def gen_arrow(path, xy):
    """Render the chunks along *path* (a list of chunk indices) as an
    'A -> B -> ...' string, stopping before the shared node `match`; the
    first chunk's first noun is replaced by the placeholder *xy* ('X'/'Y').

    NOTE(review): reads the globals `match` and `sentence` set by the loop
    below — kept as in the original.
    """
    new = []
    for i, node in enumerate(path):
        if node == match:
            break
        morph = sentence[node][0]
        if i == 0:
            morph = morph.replace(sentence[node][3], xy)
        new.append(morph)
    return ' -> '.join(new)


# Problem 49: enumerate noun-chunk pairs, find their paths to the root and
# their common ancestor, and format 'X ... | Y ... | common' records.
with open('49.txt', mode='w') as f:
    for i, sentence in enumerate(sentences):
        # if i != 7:
        #     continue
        ls_path = []
        for j, (s, d, has_noun, first_noun) in enumerate(sentence):
            if has_noun:
                ans = rec(sentence, d, s)
                ans = ans.replace(' ', '').replace('。', '').replace('、', '')
                ls_path.append(get_path(d, [j]))
        for pi, pj in itertools.combinations(ls_path, 2):
            # First index of pi that also appears in pj = shared ancestor.
            for ni in pi:
                if ni in pj:
                    match = ni
                    break
            ans = ''
            ans += gen_arrow(pi, 'X')
            ans += ' | '
            ans += gen_arrow(pj, 'Y')
            ans += ' | '
            if '| |' in ans:
                # pj reached the shared node immediately: 'X ... -> Y' form.
                ans = ans.replace('| |', '->')
                ans += 'Y'
            else:
                ans += sentence[match][0]
            ans = ans.replace(' ', '').replace('。', '').replace('、', '')
            # NOTE(review): the statements that print/write `ans` appear to be
            # truncated from this excerpt; only the visible code is restored.