# hse-python-assistant/app/utils/submit.py

from typing import Callable

import pandas as pd
import torch
from transformers import BertModel, BertTokenizer

# Load the sentence encoder once at import time so all calls share the same weights.
print("Loading models...", end="")
model_name = "DeepPavlov/rubert-base-cased-sentence"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
print("OK")


def get_sentence_embedding(sentence: str) -> torch.Tensor:
    """Return the [CLS] embedding of a sentence from the RuBERT sentence encoder."""
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    # The [CLS] token (position 0) serves as the sentence-level representation.
    embedding = outputs.last_hidden_state[:, 0, :].squeeze()
    return embedding


def string2embedding(string: str) -> torch.Tensor:
    return torch.Tensor([float(i) for i in string.split()])


def embedding2string(embedding: torch.Tensor) -> str:
    return " ".join([str(i) for i in embedding.tolist()])


def generate_submit(tests_path: str, predict_func: Callable, save_path: str, use_tqdm: bool = True) -> None:
    """Run predict_func on every test case and write the submission CSV to save_path."""
    tests = pd.read_excel(tests_path)
    bar = range(len(tests))
    if use_tqdm:
        import tqdm
        bar = tqdm.tqdm(bar, desc="Predicting")

    submit_df = pd.DataFrame(columns=["solution_id", "author_comment", "author_comment_embedding"])
    for i in bar:
        idx = tests.index[i]
        solution_row = tests.iloc[i]
        # Prompt (in Russian): task description, student solution, author solution, and test input/output.
        input_text = f"Вводные данные:\n{solution_row['description']}\n\nКод студента:\n{solution_row['student_solution']}\n\nКод автора:\n{solution_row['author_solution']}\n\nТестовые условия:\n{solution_row['input_output']}"
        text = predict_func(input_text)
        embedding = embedding2string(get_sentence_embedding(text))
        submit_df.loc[i] = [solution_row['id'], text, embedding]

    submit_df.to_csv(save_path, index=False)
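

# ---------------------------------------------------------------------------
# Usage sketch (hypothetical; the paths and the stub predict_func below are
# placeholders, not part of this module). generate_submit expects a
# predict_func that maps the assembled prompt to an author-style comment.
#
#     def predict_func(prompt: str) -> str:
#         return "stub author comment"  # replace with a real model call
#
#     generate_submit(
#         tests_path="data/test/solutions.xlsx",   # hypothetical path
#         predict_func=predict_func,
#         save_path="data/submission.csv",         # hypothetical path
#     )
# ---------------------------------------------------------------------------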