rmmhicke
/

t5-literary-coreference

Text2Text Generation

digital humanities

text-generation-inference

Inference Endpoints

Model card Files Files and versions Community

t5-literary-coreference / get_ent_clusters.py

rmmhicke's picture

Upload 2 files

0d1a7d5 9 months ago

history blame contribute delete

No virus

2.22 kB

	import pandas as pd
	import os
	import re
	import csv

	def extract_paren(annotation):
	    """Collect every bracketed mention found in an annotated string.

	    The string is scanned for "[" characters. For each one, the text up to
	    its matching "]" is copied into a span; brackets nested inside the span
	    are dropped, and so is the ": N" label that closes each nested mention,
	    so only the outermost label of the span survives.

	    When the finished span contains a ": N" label (N being 1-3 digits), a
	    separator (a backslash followed by a pipe) and a position are appended.
	    The position is the number of spaces before the opening bracket minus
	    the number of ": N" labels before it, since each label contributes one
	    space of its own.

	    Args:
	        annotation: The annotated text to scan.

	    Returns:
	        A list of span strings, ordered by the position of their opening
	        bracket (an enclosing mention therefore precedes the mentions
	        nested inside it). Brackets that are never closed produce nothing.
	    """
	    # NOTE(review): the indentation of this function was reconstructed.
	    # Spans without a label are assumed to be returned as well (without a
	    # position suffix) so that create_clusters can report them -- confirm.
	    id_label = re.compile(r": [0-9]{1,3}")
	    trailing_label = re.compile(r": [0-9]{1,3}\Z")

	    ents = []
	    for start, char in enumerate(annotation):
	        if char != "[":
	            continue

	        ent = "["
	        depth = 0  # nested "[" that are still open inside this span
	        for inner in annotation[start + 1:]:
	            if inner == "[":
	                depth += 1
	            elif inner != "]":
	                ent += inner
	            elif depth > 0:
	                # A nested mention just closed: drop its label. Stripping
	                # the label itself (instead of a fixed three characters)
	                # keeps multi-digit and unlabeled nested spans intact.
	                depth -= 1
	                ent = trailing_label.sub("", ent)
	            else:
	                ent += "]"
	                if id_label.search(ent):
	                    prefix = annotation[:start]
	                    position = prefix.count(" ") - len(id_label.findall(prefix))
	                    # "\\|" is a literal backslash plus a pipe, spelled
	                    # without relying on an invalid escape sequence.
	                    ent += "\\|" + str(position)
	                ents.append(ent)
	                break
	    return ents

	def create_clusters(ents):
	    """Group mention strings by the numeric label they carry.

	    Each entry is searched for a ": N" label (N being 1-3 digits). The
	    entry is then stripped of every "[" and "]" and of every occurrence of
	    that label text, and what remains is filed under the integer N.

	    Entries without a label are not clustered; they are reported on
	    standard output as "OH NO: <entry>" followed by an empty line.

	    Args:
	        ents: An iterable of mention strings.

	    Returns:
	        A dict mapping each integer label to the list of cleaned mention
	        strings that carry it, in the order they were encountered.
	    """
	    id_label = re.compile(r": ([0-9]{1,3})")

	    clusters = {}
	    for ent in ents:
	        label = id_label.search(ent)
	        if label is None:
	            print("OH NO:", ent)
	            print()
	            continue

	        mention = ent.replace("[", "").replace("]", "").replace(label.group(), "")
	        clusters.setdefault(int(label.group(1)), []).append(mention)

	    return clusters

	# Read the model outputs from results.csv, derive the mention clusters of
	# every annotated output, and write them next to the original columns in
	# cluster_results.csv.
	headers = ["input", "model_output", "model_output_clusters"]

	df = pd.read_csv("results.csv")

	rows = []
	for source_text, annotation in zip(df["input"], df["model_output"]):
	    # Only string outputs can be parsed; pandas presumably hands back a
	    # non-string (NaN) for empty cells.
	    # NOTE(review): such rows are left out of the result file -- this
	    # follows the reconstructed indentation of the original loop; confirm.
	    if not isinstance(annotation, str):
	        continue

	    ann_ents = extract_paren(annotation)
	    ann_clusters = create_clusters(ann_ents) if ann_ents else {}
	    rows.append([source_text, annotation, str(ann_clusters)])

	# newline="" is what the csv module expects for files it writes (avoids
	# doubled line endings on Windows); UTF-8 keeps non-ASCII text writable
	# regardless of the platform default; the with-block closes the file even
	# if a write fails.
	with open("cluster_results.csv", "w", newline="", encoding="utf-8") as f:
	    writer = csv.writer(f)
	    writer.writerow(headers)
	    writer.writerows(rows)