This commit is contained in:
Zakharov Oleg 2024-10-16 23:24:08 +03:00
commit 162885a76d
26 changed files with 4163 additions and 0 deletions

19
Dockerfile Normal file
View File

@ -0,0 +1,19 @@
# Runtime image for the hse-python-assistant solution; the default command runs main.py.
FROM python:3.11-slim

WORKDIR /app

# Pin Poetry so rebuilds are reproducible; --no-cache-dir keeps pip's download cache out of the layer.
ARG POETRY_VERSION=1.8.4
RUN pip install --no-cache-dir "poetry==${POETRY_VERSION}"

# Copy only the dependency manifests first so the install layer stays cached until they change.
COPY pyproject.toml poetry.lock* /app/

# Install into the system interpreter (no virtualenv inside the container).
# --no-root: the project sources are not copied yet, so only third-party dependencies are
# installed here; main.py imports the `app` package straight from /app at runtime.
# Poetry's artifact cache is removed in the same layer so downloaded wheels do not bloat the image.
RUN poetry config virtualenvs.create false \
    && poetry install --no-interaction --no-ansi --no-root \
    && rm -rf /root/.cache/pypoetry

COPY . /app

# Make sure the data directories exist even when they are absent from the build context.
RUN mkdir -p /app/data/complete /app/data/processed /app/data/raw/test /app/data/raw/train

EXPOSE 8000

ENV PYTHONUNBUFFERED=1

CMD ["poetry", "run", "python", "main.py"]

45
README.md Normal file
View File

@ -0,0 +1,45 @@
# hse-python-assistant
TBD
## Ссылки
Тестирующая система: https://dsworks.ru/champ/hse-2024-october \
Лендинг: https://www.hse.ru/ai-assistant-hack-python/
## Запуск
TBD
## Структура проекта
```
.
├── app
│   ├── __init__.py
│   ├── models <------------------------ LLM в формате gguf
│   └── utils <------------------------ утилиты
│       ├── __init__.py
│       ├── metric.py <------------------------ ознакомьтесь с метрикой
│       └── submit.py <------------------------ здесь всё для генерации сабмита
├── data
│   ├── complete <------------------------ подготовленные данные, сабмиты
│   ├── processed <----------------------- промежуточный этап подготовки данных
│   └── raw <----------------------------- исходные данные
│       ├── submit_example.csv
│       ├── test
│       │   ├── solutions.xlsx
│       │   ├── tasks.xlsx
│       │   └── tests.xlsx
│       └── train
│           ├── solutions.xlsx
│           ├── tasks.xlsx
│           └── tests.xlsx
├── main.py <---------------------------- [ВАЖНО] Именно этот скрипт мы будем запускать при проверке ваших решений. Он должен генерировать финальный сабмит.
├── poetry.lock
├── pyproject.toml
├── README.md
└── tests
    ├── test_correctness.py <------------------------ проверить на корректность сабмит
    └── test_embedding_generation.py <--------------- попробовать генерацию эмбеддингов и подсчёт метрики
```

0
app/__init__.py Normal file
View File

0
app/utils/__init__.py Normal file
View File

35
app/utils/metric.py Normal file
View File

@ -0,0 +1,35 @@
import pandas as pd
from torch.nn.functional import cosine_similarity
from app.utils.submit import string2embedding
def _get_cosine_similarity(pred_df: pd.DataFrame, true_df: pd.DataFrame) -> float:
predictions = pred_df["author_comment_embedding"]
true_values = true_df["author_comment_embedding"]
total_cos_sim = 0
for idx in range(len(true_values)):
pred_value = string2embedding(predictions.iloc[idx])
gt_value = string2embedding(true_values.iloc[idx])
if len(pred_value) != len(gt_value):
raise ValueError(f"Embeddings have different sizes: {len(pred_value)} != {len(gt_value)}")
cos_sim_value = cosine_similarity(pred_value.unsqueeze(0), gt_value.unsqueeze(0))
total_cos_sim += cos_sim_value
return float(total_cos_sim / len(true_df))
def calculate_score(submit_path: str, gt_path: str) -> float:
    """Compute the rescaled score of a submit file against a ground-truth workbook.

    The submit CSV is restricted to rows whose ``solution_id`` occurs in the ground truth's
    ``id`` column, the mean cosine similarity is computed, and the result is mapped
    linearly so that a similarity of 0.6 gives 0.0 and a similarity of 1.0 gives 1.0.

    NOTE(review): the remaining rows are compared with the ground truth by position, not
    by id - confirm both files are ordered consistently.

    Args:
        submit_path: Path to the submit ``.csv`` file.
        gt_path: Path to the ground-truth Excel file.

    Returns:
        ``(mean_cosine_similarity - 0.6) / 0.4``.
    """
    submission = pd.read_csv(submit_path)
    ground_truth = pd.read_excel(gt_path)

    known_id_mask = submission["solution_id"].isin(ground_truth["id"])
    mean_similarity = _get_cosine_similarity(submission[known_id_mask], ground_truth)

    return (mean_similarity - 0.6) / 0.4
def calculate_score_and_save(submit_path: str, gt_path: str, save_path: str) -> float:
    """Compute the score via ``calculate_score`` and write it to ``save_path`` as text.

    Args:
        submit_path: Path to the submit ``.csv`` file.
        gt_path: Path to the ground-truth Excel file.
        save_path: File that receives the formatted score (overwritten if it exists).

    Returns:
        The computed score.
    """
    result = calculate_score(submit_path, gt_path)

    with open(save_path, "w") as score_file:
        score_file.write(f"{result}")

    return result

47
app/utils/submit.py Normal file
View File

@ -0,0 +1,47 @@
from typing import Callable
import pandas as pd
import torch
from transformers import BertModel, BertTokenizer
# Load the tokenizer and the BERT model once, when this module is imported;
# get_sentence_embedding() reuses the module-level `tokenizer` and `model`.
print("Loading models...", end="")
model_name = "DeepPavlov/rubert-base-cased-sentence"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
print("OK")
def get_sentence_embedding(sentence: str) -> torch.Tensor:
    """Embed ``sentence`` with the module-level tokenizer and BERT model.

    The text is tokenized with truncation to at most 128 tokens. The last hidden state at
    sequence position 0 is returned with all size-1 dimensions squeezed away.

    Args:
        sentence: Text to embed.

    Returns:
        The embedding tensor.
    """
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Inference only: no gradient tracking is needed.
    with torch.no_grad():
        last_hidden_state = model(**encoded).last_hidden_state

    first_position_state = last_hidden_state[:, 0, :]
    return first_position_state.squeeze()
def string2embedding(string: str) -> torch.Tensor:
    """Parse a whitespace-separated string of numbers into a 1-D tensor.

    Inverse of ``embedding2string``.
    """
    values = list(map(float, string.split()))
    return torch.Tensor(values)
def embedding2string(embedding: torch.Tensor) -> str:
    """Serialize a tensor's values as a single space-separated string.

    Inverse of ``string2embedding``.
    """
    return " ".join(map(str, embedding.tolist()))
def generate_submit(test_solutions_path: str, predict_func: Callable, save_path: str, use_tqdm: bool = True) -> None:
    """Run ``predict_func`` on every test solution and write the submit CSV.

    For each row of the Excel file at ``test_solutions_path`` the predicted comment is
    embedded with ``get_sentence_embedding`` and serialized with ``embedding2string``.
    The output has the columns ``solution_id``, ``author_comment`` and
    ``author_comment_embedding``.

    NOTE(review): ``solution_id`` is taken from the DataFrame index of the loaded sheet,
    not from a column of the file - confirm this matches the ids expected by the checker.

    Args:
        test_solutions_path: Path to the Excel file with the test solutions.
        predict_func: Callable that receives one row (``pd.Series``) and returns the comment text.
        save_path: Destination path of the resulting ``.csv`` file.
        use_tqdm: Show a progress bar while predicting (imports ``tqdm`` lazily).
    """
    test_solutions = pd.read_excel(test_solutions_path)

    bar = range(len(test_solutions))
    if use_tqdm:
        # Imported lazily so tqdm is only required when a progress bar is requested.
        import tqdm

        bar = tqdm.tqdm(bar, desc="Predicting")

    # Collect rows in a list and build the frame once: growing a DataFrame one
    # `.loc` assignment at a time gets slower with every added row.
    rows = []
    for i in bar:
        idx = test_solutions.index[i]
        solution_row = test_solutions.iloc[i]

        text = predict_func(solution_row)  # here you can do absolute whatever you want

        embedding = embedding2string(get_sentence_embedding(text))
        rows.append((idx, text, embedding))

    submit_df = pd.DataFrame(rows, columns=["solution_id", "author_comment", "author_comment_embedding"])
    submit_df.to_csv(save_path, index=False)

0
data/.gitkeep Normal file
View File

0
data/complete/.gitkeep Normal file
View File

0
data/processed/.gitkeep Normal file
View File

0
data/raw/.gitkeep Normal file
View File

File diff suppressed because one or more lines are too long

0
data/raw/test/.gitkeep Normal file
View File

Binary file not shown.

BIN
data/raw/test/tasks.xlsx Normal file

Binary file not shown.

BIN
data/raw/test/tests.xlsx Normal file

Binary file not shown.

0
data/raw/train/.gitkeep Normal file
View File

Binary file not shown.

BIN
data/raw/train/tasks.xlsx Normal file

Binary file not shown.

BIN
data/raw/train/tests.xlsx Normal file

Binary file not shown.

28
docker-compose.yaml Normal file
View File

@ -0,0 +1,28 @@
# Compose definition for the solution container: builds the local Dockerfile and runs main.py.
version: "3.8"
services:
  app:
    build:
      context: .
      dockerfile: Dockerfile
    volumes:
      # Bind-mount the project directory over the image's /app.
      - .:/app
      # The data directory is mounted explicitly as well (it is also part of the mount above).
      - ./data:/app/data
    ports:
      - "8000:8000"
    environment:
      - PYTHONUNBUFFERED=1
    command: poetry run python main.py
    # TODO: uncomment code below if you want to run it on GPU
    # NOTE: `environment` is already declared above. When enabling the GPU settings, add
    # NVIDIA_VISIBLE_DEVICES=all to that existing list instead of declaring the key twice.
    # environment:
    #   - NVIDIA_VISIBLE_DEVICES=all
    # deploy:
    #   mode: replicated
    #   replicas: 1
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           device_ids: [ '0' ]
    #           capabilities: [ gpu ]

31
main.py Normal file
View File

@ -0,0 +1,31 @@
import os
import pandas as pd
from app.models.yandexgpt import YandexGPT
from app.utils.submit import generate_submit
if __name__ == "__main__":
    # `load_dotenv` was called without being imported, which raised NameError at startup.
    # python-dotenv is already a declared project dependency.
    from dotenv import load_dotenv

    # Load variables from a local .env file (if any) into os.environ before they are read below.
    load_dotenv()

    # Resolve data paths against this file's directory. The former "../data/..." paths
    # pointed outside the project when the script was started from the project root.
    project_root = os.path.dirname(os.path.abspath(__file__))

    system_prompt = """
Ты - профессиональный программист и ментор. Давай очень короткие ответы о синтаксических ошибках в коде, если они есть.
"""

    yandex_gpt = YandexGPT(
        token=os.environ["YANDEX_GPT_IAM_TOKEN"],
        folder_id=os.environ["YANDEX_GPT_FOLDER_ID"],
        system_prompt=system_prompt,
    )

    def predict(row: pd.Series) -> str:
        """Ask the model for a comment on the student's solution stored in ``row``."""
        return yandex_gpt.ask(row["student_solution"])

    generate_submit(
        test_solutions_path=os.path.join(project_root, "data", "raw", "test", "solutions.xlsx"),
        predict_func=predict,
        save_path=os.path.join(project_root, "data", "processed", "submission.csv"),
        use_tqdm=True,
    )

3533
poetry.lock generated Normal file

File diff suppressed because it is too large Load Diff

30
pyproject.toml Normal file
View File

@ -0,0 +1,30 @@
# Project metadata and dependency declarations, managed with Poetry.
[tool.poetry]
name = "hse-python-assistant"
version = "0.1.0"
description = "Thanks, Beyonce team solution for HSE AI Assistant Hack: Python [https://www.hse.ru/ai-assistant-hack-python/]"
authors = ["Andrei Anikin <andreyf2357@gmail.com>", "Egor Gorokhov <9143999@gmail.com>", "Iaroslava Vinogradova <mikhailenko.yi@gmail.com>", "Oleg Zakharov <os.zakharov.04@gmail.com>"]
readme = "README.md"

# Runtime dependencies.
[tool.poetry.dependencies]
python = "^3.11"
requests = "^2.32.3"
python-dotenv = "^1.0.1"
pandas = "^2.2.3"
scikit-learn = "^1.5.2"
torch = "^2.4.1"
transformers = "^4.45.2"
openpyxl = "^3.1.5"
accelerate = "^1.0.1"

# Development-only dependencies.
[tool.poetry.group.dev.dependencies]
black = { extras = ["jupyter"], version = "^24.10.0" }
pre-commit = "^4.0.1"
jupyter = "^1.1.1"
# NOTE(review): tqdm is imported at runtime by app/utils/submit.py when use_tqdm=True
# (which main.py passes), so an install that skips the dev group would lack it - consider
# moving it to the main dependencies and regenerating poetry.lock.
tqdm = "^4.66.5"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.black]
line-length = 120

0
tests/.gitkeep Normal file
View File

52
tests/test_correctness.py Normal file
View File

@ -0,0 +1,52 @@
import pandas as pd
from app.utils.submit import string2embedding
# Row count used by _check_rows_size_correctness().
# presumably the number of solutions in the test set - TODO confirm
TEST_SIZE = 325
# Number of values every serialized embedding must contain.
# presumably the hidden size of the embedding model - TODO confirm
EMBEDDING_SIZE = 768
def _check_ids_correctness(submit_df: pd.DataFrame, submit_example_df: pd.DataFrame) -> bool:
not_presented = set(submit_example_df["solution_id"]) - set(submit_df["solution_id"])
not_needed = set(submit_df["solution_id"]) - set(submit_example_df["solution_id"])
not_presented = list(not_presented)
not_presented.sort()
not_needed = list(not_needed)
not_needed.sort()
error_message = "Submit is incorrect."
if len(not_presented) + len(not_needed) > 0:
if len(not_presented) > 0:
error_message += f" Not presented solution_id: {not_presented}."
if len(not_needed) > 0:
error_message += f" Not needed solution_id: {not_needed}."
raise ValueError(error_message)
return True
def _check_rows_size_correctness(submit_df: pd.DataFrame) -> bool:
    """Check that every submitted embedding contains exactly ``EMBEDDING_SIZE`` values.

    Every row of the submit is inspected, not only the first ``TEST_SIZE`` rows, and a
    submit that is too short is reported explicitly instead of failing with an
    out-of-bounds ``IndexError``.

    Args:
        submit_df: Submit to validate; must have an ``author_comment_embedding`` column.

    Returns:
        ``True`` when all rows are well-formed.

    Raises:
        ValueError: If the submit has fewer than ``TEST_SIZE`` rows, or if any row's
            embedding has the wrong number of values (positional row indices are listed).
    """
    embeddings = submit_df["author_comment_embedding"]
    row_count = len(embeddings)

    if row_count < TEST_SIZE:
        raise ValueError(f"Submit has {row_count} rows, expected at least {TEST_SIZE}.")

    incorrect_rows = [
        idx for idx in range(row_count) if len(string2embedding(embeddings.iloc[idx])) != EMBEDDING_SIZE
    ]

    if len(incorrect_rows) > 0:
        raise ValueError(f"Submit has incorrect rows: {incorrect_rows}. (incorrect size of embedding)")

    return True
def check_submit_correctness(submit_path: str, submit_example_path: str) -> bool:
    """Validate a submit file against the example submit.

    The submit must be a ``.csv`` file, contain exactly the ids of the example, and hold
    correctly sized embeddings.

    Args:
        submit_path: Path to the submit to validate.
        submit_example_path: Path to the example submit ``.csv`` file.

    Returns:
        ``True`` when all checks pass.

    Raises:
        ValueError: If ``submit_path`` does not end with ``.csv`` or any check fails.
    """
    has_csv_extension = submit_path.endswith(".csv")
    if not has_csv_extension:
        raise ValueError(f"{submit_path} is not a .csv file.")

    submission = pd.read_csv(submit_path)
    reference = pd.read_csv(submit_example_path)

    _check_ids_correctness(submission, reference)
    _check_rows_size_correctness(submission)

    return True
# When executed as a script, validate the default submit file against the example submit.
if __name__ == "__main__":
    check_submit_correctness(submit_path="data/complete/submit.csv", submit_example_path="data/raw/submit_example.csv")

View File

@ -0,0 +1,17 @@
from app.utils.metric import cosine_similarity
from app.utils.submit import get_sentence_embedding
def test():
    """Embed two similar Russian sentences and print their cosine similarity."""
    sentences = (
        "Вы забыли поставить префикс f перед строкой, переданной функции print().",
        "Вы забыли поставить префикс f перед строкой.",
    )

    first_embedding, second_embedding = (get_sentence_embedding(text) for text in sentences)

    # unsqueeze(0) adds a leading dimension so each embedding is passed as a 1-row batch.
    cos_sim = cosine_similarity(first_embedding.unsqueeze(0), second_embedding.unsqueeze(0))
    print(f"Cosine Similarity: {cos_sim.item():.4f}")
# Run the embedding smoke check when this file is executed directly.
if __name__ == "__main__":
    test()