commit 162885a76d
Dockerfile
@@ -0,0 +1,19 @@
FROM python:3.11-slim

WORKDIR /app

RUN pip install poetry

COPY pyproject.toml poetry.lock* /app/

RUN poetry config virtualenvs.create false && poetry install --no-interaction --no-ansi

COPY . /app

RUN mkdir -p /app/data/complete /app/data/processed /app/data/raw/test /app/data/raw/train

EXPOSE 8000

ENV PYTHONUNBUFFERED=1

CMD ["poetry", "run", "python", "main.py"]
README.md
@@ -0,0 +1,45 @@
# hse-python-assistant

TBD

## Links

Testing system: https://dsworks.ru/champ/hse-2024-october \
Landing page: https://www.hse.ru/ai-assistant-hack-python/

## Running

TBD

## Project structure

```
.
├── app
│   ├── __init__.py
│   ├── models <---------------------------- LLM in gguf format
│   └── utils <----------------------------- utilities
│       ├── __init__.py
│       ├── metric.py <--------------------- get familiar with the metric
│       └── submit.py <--------------------- everything needed to generate a submission
├── data
│   ├── complete <-------------------------- prepared data, submissions
│   ├── processed <------------------------- intermediate stage of data preparation
│   └── raw <------------------------------- source data
│       ├── submit_example.csv
│       ├── test
│       │   ├── solutions.xlsx
│       │   ├── tasks.xlsx
│       │   └── tests.xlsx
│       └── train
│           ├── solutions.xlsx
│           ├── tasks.xlsx
│           └── tests.xlsx
├── main.py <--------------------------------- [IMPORTANT] This is exactly the script we will run when checking your solutions. It must generate the final submission.
├── poetry.lock
├── pyproject.toml
├── README.md
└── tests
    ├── test_correctness.py <----------------- check a submission for correctness
    └── test_embedding_generation.py <-------- try embedding generation and metric computation
```
app/utils/metric.py
@@ -0,0 +1,35 @@
import pandas as pd
from torch.nn.functional import cosine_similarity

from app.utils.submit import string2embedding


def _get_cosine_similarity(pred_df: pd.DataFrame, true_df: pd.DataFrame) -> float:
    predictions = pred_df["author_comment_embedding"]
    true_values = true_df["author_comment_embedding"]
    total_cos_sim = 0

    for idx in range(len(true_values)):
        pred_value = string2embedding(predictions.iloc[idx])
        gt_value = string2embedding(true_values.iloc[idx])

        if len(pred_value) != len(gt_value):
            raise ValueError(f"Embeddings have different sizes: {len(pred_value)} != {len(gt_value)}")

        cos_sim_value = cosine_similarity(pred_value.unsqueeze(0), gt_value.unsqueeze(0))
        total_cos_sim += cos_sim_value
    return float(total_cos_sim / len(true_df))


def calculate_score(submit_path: str, gt_path: str) -> float:
    submit_df = pd.read_csv(submit_path)
    true_df = pd.read_excel(gt_path)
    # Keep only the submitted rows that have a ground-truth counterpart.
    submit_df = submit_df[submit_df["solution_id"].isin(true_df["id"])]
    # Rescale the mean cosine similarity: 0.6 maps to a score of 0, 1.0 maps to 1.
    return (_get_cosine_similarity(submit_df, true_df) - 0.6) / 0.4


def calculate_score_and_save(submit_path: str, gt_path: str, save_path: str) -> float:
    score = calculate_score(submit_path, gt_path)
    with open(save_path, "w") as f:
        f.write(f"{score}")
    return score
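The `(x - 0.6) / 0.4` rescaling in `calculate_score` maps a mean cosine similarity of 0.6 to a leaderboard score of 0 and a perfect 1.0 to a score of 1. A minimal sketch of that relationship, using only torch; the two vectors are made-up illustration data, not project embeddings:

```python
import torch
from torch.nn.functional import cosine_similarity

# Arbitrary stand-ins for a predicted and a ground-truth embedding.
pred = torch.tensor([0.10, 0.20, 0.30]).unsqueeze(0)
gt = torch.tensor([0.12, 0.18, 0.33]).unsqueeze(0)

mean_cos = cosine_similarity(pred, gt).item()  # raw similarity, in [-1, 1]
score = (mean_cos - 0.6) / 0.4                 # same linear rescaling as calculate_score
print(f"cosine similarity: {mean_cos:.4f}, leaderboard score: {score:.4f}")
```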
app/utils/submit.py
@@ -0,0 +1,47 @@
from typing import Callable

import pandas as pd
import torch
from transformers import BertModel, BertTokenizer

print("Loading models...", end="")
model_name = "DeepPavlov/rubert-base-cased-sentence"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
print("OK")


def get_sentence_embedding(sentence: str) -> torch.Tensor:
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the [CLS] token representation as the sentence embedding.
    embedding = outputs.last_hidden_state[:, 0, :].squeeze()
    return embedding


def string2embedding(string: str) -> torch.Tensor:
    return torch.Tensor([float(i) for i in string.split()])


def embedding2string(embedding: torch.Tensor) -> str:
    return " ".join([str(i) for i in embedding.tolist()])


def generate_submit(test_solutions_path: str, predict_func: Callable, save_path: str, use_tqdm: bool = True) -> None:
    test_solutions = pd.read_excel(test_solutions_path)
    bar = range(len(test_solutions))
    if use_tqdm:
        import tqdm

        bar = tqdm.tqdm(bar, desc="Predicting")

    submit_df = pd.DataFrame(columns=["solution_id", "author_comment", "author_comment_embedding"])
    for i in bar:
        idx = test_solutions.index[i]
        solution_row = test_solutions.iloc[i]

        text = predict_func(solution_row)  # here you can do absolutely whatever you want

        embedding = embedding2string(get_sentence_embedding(text))
        submit_df.loc[i] = [idx, text, embedding]
    submit_df.to_csv(save_path, index=False)
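A brief sketch of the round trip behind `embedding2string` / `string2embedding`, which is how embeddings end up stored as space-separated floats in the submission CSV. The helper bodies are inlined here so the example does not trigger the module-level BERT download above; the tensor values are purely illustrative:

```python
import torch

vec = torch.tensor([0.125, -1.5, 3.0])

as_text = " ".join(str(x) for x in vec.tolist())              # mirrors embedding2string
restored = torch.Tensor([float(x) for x in as_text.split()])  # mirrors string2embedding

assert torch.allclose(vec, restored)
print(as_text)  # "0.125 -1.5 3.0"
```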
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
docker-compose.yml
@@ -0,0 +1,28 @@
version: "3.8"

services:
  app:
    build:
      context: .
      dockerfile: Dockerfile
    volumes:
      - .:/app
      - ./data:/app/data
    ports:
      - "8000:8000"
    environment:
      - PYTHONUNBUFFERED=1
    command: poetry run python main.py

    # TODO: uncomment the block below if you want to run on a GPU.
    # Note: fold NVIDIA_VISIBLE_DEVICES into the environment list above
    # rather than adding a second environment key.
    # environment:
    #   - NVIDIA_VISIBLE_DEVICES=all
    # deploy:
    #   mode: replicated
    #   replicas: 1
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           device_ids: [ '0' ]
    #           capabilities: [ gpu ]
main.py
@@ -0,0 +1,31 @@
import os

import pandas as pd
from dotenv import load_dotenv

from app.models.yandexgpt import YandexGPT
from app.utils.submit import generate_submit

if __name__ == "__main__":
    # Load YANDEX_GPT_* credentials from a local .env file.
    load_dotenv()

    # System prompt (in Russian): "You are a professional programmer and mentor.
    # Give very short answers about syntax errors in the code, if there are any."
    system_prompt = """
    Ты - профессиональный программист и ментор. Давай очень короткие ответы о синтаксических ошибках в коде, если они есть.
    """

    yandex_gpt = YandexGPT(
        token=os.environ["YANDEX_GPT_IAM_TOKEN"],
        folder_id=os.environ["YANDEX_GPT_FOLDER_ID"],
        system_prompt=system_prompt,
    )

    def predict(row: pd.Series) -> str:
        return yandex_gpt.ask(row["student_solution"])

    generate_submit(
        test_solutions_path="../data/raw/test/solutions.xlsx",
        predict_func=predict,
        save_path="../data/processed/submission.csv",
        use_tqdm=True,
    )
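`generate_submit` only needs a `predict_func`, so the pipeline can be exercised without YandexGPT credentials. A hedged sketch of a stand-in predictor: the `student_solution` column name is taken from main.py above, while the constant comment text is an illustrative placeholder and the paths assume the repository layout from the README:

```python
import pandas as pd

from app.utils.submit import generate_submit


def offline_predict(row: pd.Series) -> str:
    # Placeholder comment instead of a real LLM call; every row gets the same text.
    return "Проверьте синтаксис решения."


if __name__ == "__main__":
    generate_submit(
        test_solutions_path="data/raw/test/solutions.xlsx",
        predict_func=offline_predict,
        save_path="data/complete/submit.csv",
        use_tqdm=True,
    )
```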
File diff suppressed because it is too large
pyproject.toml
@@ -0,0 +1,30 @@
[tool.poetry]
name = "hse-python-assistant"
version = "0.1.0"
description = "Thanks, Beyonce team solution for HSE AI Assistant Hack: Python [https://www.hse.ru/ai-assistant-hack-python/]"
authors = ["Andrei Anikin <andreyf2357@gmail.com>", "Egor Gorokhov <9143999@gmail.com>", "Iaroslava Vinogradova <mikhailenko.yi@gmail.com>", "Oleg Zakharov <os.zakharov.04@gmail.com>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.11"
requests = "^2.32.3"
python-dotenv = "^1.0.1"
pandas = "^2.2.3"
scikit-learn = "^1.5.2"
torch = "^2.4.1"
transformers = "^4.45.2"
openpyxl = "^3.1.5"
accelerate = "^1.0.1"

[tool.poetry.group.dev.dependencies]
black = { extras = ["jupyter"], version = "^24.10.0" }
pre-commit = "^4.0.1"
jupyter = "^1.1.1"
tqdm = "^4.66.5"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.black]
line-length = 120
tests/test_correctness.py
@@ -0,0 +1,52 @@
import pandas as pd

from app.utils.submit import string2embedding

TEST_SIZE = 325
EMBEDDING_SIZE = 768


def _check_ids_correctness(submit_df: pd.DataFrame, submit_example_df: pd.DataFrame) -> bool:
    # Compare both directions: IDs the grader expects but did not get, and IDs it did not expect.
    not_presented = set(submit_example_df["solution_id"]) - set(submit_df["solution_id"])
    not_needed = set(submit_df["solution_id"]) - set(submit_example_df["solution_id"])

    not_presented = sorted(not_presented)
    not_needed = sorted(not_needed)

    error_message = "Submit is incorrect."
    if len(not_presented) + len(not_needed) > 0:
        if len(not_presented) > 0:
            error_message += f" Missing solution_id values: {not_presented}."
        if len(not_needed) > 0:
            error_message += f" Unexpected solution_id values: {not_needed}."
        raise ValueError(error_message)
    return True


def _check_rows_size_correctness(submit_df: pd.DataFrame) -> bool:
    incorrect_rows = []
    for idx in range(TEST_SIZE):
        if len(string2embedding(submit_df["author_comment_embedding"].iloc[idx])) != EMBEDDING_SIZE:
            incorrect_rows.append(idx)
    if len(incorrect_rows) > 0:
        raise ValueError(f"Submit has incorrect rows: {incorrect_rows} (incorrect embedding size).")
    return True


def check_submit_correctness(submit_path: str, submit_example_path: str) -> bool:
    if not submit_path.endswith(".csv"):
        raise ValueError(f"{submit_path} is not a .csv file.")

    submit_df = pd.read_csv(submit_path)
    submit_example_df = pd.read_csv(submit_example_path)

    _check_ids_correctness(submit_df, submit_example_df)
    _check_rows_size_correctness(submit_df)

    return True


if __name__ == "__main__":
    check_submit_correctness(submit_path="data/complete/submit.csv", submit_example_path="data/raw/submit_example.csv")
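The ID check above hinges on both directions of a set difference: rows the grader expects but did not receive, and rows that do not appear in the example submission. A tiny self-contained illustration with made-up IDs (not real solution_id values):

```python
expected_ids = {101, 102, 103, 104}  # solution_id values from submit_example.csv (illustrative)
submitted_ids = {101, 102, 105}      # solution_id values from the candidate submission

missing = sorted(expected_ids - submitted_ids)     # expected but absent -> [103, 104]
unexpected = sorted(submitted_ids - expected_ids)  # present but not expected -> [105]

print(missing, unexpected)
```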
tests/test_embedding_generation.py
@@ -0,0 +1,17 @@
from app.utils.metric import cosine_similarity
from app.utils.submit import get_sentence_embedding


def test():
    # Two similar review comments in Russian: "You forgot to put the f prefix before the
    # string passed to print()." / "You forgot to put the f prefix before the string."
    sentence1 = "Вы забыли поставить префикс f перед строкой, переданной функции print()."
    sentence2 = "Вы забыли поставить префикс f перед строкой."

    embedding1 = get_sentence_embedding(sentence1)
    embedding2 = get_sentence_embedding(sentence2)

    cos_sim = cosine_similarity(embedding1.unsqueeze(0), embedding2.unsqueeze(0))
    print(f"Cosine Similarity: {cos_sim.item():.4f}")


if __name__ == "__main__":
    test()