v1

2024-10-16 23:24:08 +03:00 · 2024-10-16 23:24:08 +03:00 · 162885a76d
commit 162885a76d
26 changed files with 4163 additions and 0 deletions
--- a/19
+++ b/19
@ -0,0 +1,19 @@
+# Slim CPython 3.11 base image (pyproject.toml requires python ^3.11).
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Poetry both installs the dependencies and launches the app (see CMD).
+RUN pip install poetry
+
+# Copy only the dependency manifests first, so the install layer below is
+# rebuilt only when pyproject.toml / poetry.lock change.
+COPY pyproject.toml poetry.lock* /app/
+
+# --no-root: the project sources and the README referenced by pyproject.toml
+# are not in the image yet, so only the dependencies can be installed here.
+RUN poetry config virtualenvs.create false && poetry install --no-interaction --no-ansi --no-root
+
+COPY . /app
+
+# Make sure the data directories exist even if they were not part of the build context.
+RUN mkdir -p /app/data/complete /app/data/processed /app/data/raw/test /app/data/raw/train
+
+EXPOSE 8000
+
+ENV PYTHONUNBUFFERED=1
+
+CMD ["poetry", "run", "python", "main.py"]
--- a/README.md
+++ b/README.md
@ -0,0 +1,45 @@
+# hse-python-assistant
+
+TBD
+
+## Ссылки
+
+Тестирующая система: https://dsworks.ru/champ/hse-2024-october \
+Лендинг: https://www.hse.ru/ai-assistant-hack-python/
+
+## Запуск
+
+TBD
+
+## Структура проекта
+
+```
+.
+├── app
+│   ├── __init__.py
+│   ├── models   <------------------------ LLM в формате gguf
+│   └── utils    <------------------------ утилиты
+│       ├── __init__.py
+│       ├── metric.py <------------------------ ознакомьтесь с метрикой
+│       └── submit.py <------------------------ здесь всё для генерации сабмита
+├── data
+│   ├── complete <------------------------ подготовленные данные, сабмиты
+│   ├── processed <----------------------- промежуточный этап подготовки данных
+│   └── raw <----------------------------- исходные данные
+│       ├── submit_example.csv
+│       ├── test
+│       │   ├── solutions.xlsx
+│       │   ├── tasks.xlsx
+│       │   └── tests.xlsx
+│       └── train
+│           ├── solutions.xlsx
+│           ├── tasks.xlsx
+│           └── tests.xlsx
+├── main.py <---------------------------- [ВАЖНО] Именно этот скрипт мы будем запускать при проверке ваших решений. Он должен генерировать финальный сабмит.
+├── poetry.lock
+├── pyproject.toml
+├── README.md
+└── tests
+    ├── test_correctness.py <------------------------ проверить на корректность сабмит
+    └── test_embedding_generation.py <--------------- попробовать генерацию эмбеддингов и подсчёт метрики
+```
--- a/app/init.py
+++ b/app/init.py
--- a/app/utils/init.py
+++ b/app/utils/init.py
--- a/app/utils/metric.py
+++ b/app/utils/metric.py
@ -0,0 +1,35 @@
+import pandas as pd
+from torch.nn.functional import cosine_similarity
+
+from app.utils.submit import string2embedding
+
+
+def _get_cosine_similarity(pred_df: pd.DataFrame, true_df: pd.DataFrame) -> float:
+    predictions = pred_df["author_comment_embedding"]
+    true_values = true_df["author_comment_embedding"]
+    total_cos_sim = 0
+
+    for idx in range(len(true_values)):
+        pred_value = string2embedding(predictions.iloc[idx])
+        gt_value = string2embedding(true_values.iloc[idx])
+
+        if len(pred_value) != len(gt_value):
+            raise ValueError(f"Embeddings have different sizes: {len(pred_value)} != {len(gt_value)}")
+
+        cos_sim_value = cosine_similarity(pred_value.unsqueeze(0), gt_value.unsqueeze(0))
+        total_cos_sim += cos_sim_value
+    return float(total_cos_sim / len(true_df))
+
+
+def calculate_score(submit_path: str, gt_path: str) -> float:
+    submit_df = pd.read_csv(submit_path)
+    true_df = pd.read_excel(gt_path)
+    submit_df = submit_df[submit_df["solution_id"].isin(true_df["id"])]
+    return (_get_cosine_similarity(submit_df, true_df) - 0.6) / 0.4
+
+
+def calculate_score_and_save(submit_path: str, gt_path: str, save_path: str) -> float:
+    score = calculate_score(submit_path, gt_path)
+    with open(save_path, "w") as f:
+        f.write(f"{score}")
+    return score
--- a/app/utils/submit.py
+++ b/app/utils/submit.py
@ -0,0 +1,47 @@
+from typing import Callable
+
+import pandas as pd
+import torch
+from transformers import BertModel, BertTokenizer
+
+# The tokenizer and the encoder are loaded once, when this module is imported,
+# and are shared by get_sentence_embedding() below.
+print("Loading models...", end="")
+model_name = "DeepPavlov/rubert-base-cased-sentence"
+tokenizer = BertTokenizer.from_pretrained(model_name)
+model = BertModel.from_pretrained(model_name)
+print("OK")
+
+
+def get_sentence_embedding(sentence: str) -> torch.Tensor:
+    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)
+    with torch.no_grad():
+        outputs = model(**inputs)
+        embedding = outputs.last_hidden_state[:, 0, :].squeeze()
+    return embedding
+
+
+def string2embedding(string: str) -> torch.Tensor:
+    return torch.Tensor([float(i) for i in string.split()])
+
+
+def embedding2string(embedding: torch.Tensor) -> str:
+    return " ".join([str(i) for i in embedding.tolist()])
+
+
+def generate_submit(test_solutions_path: str, predict_func: Callable, save_path: str, use_tqdm: bool = True) -> None:
+    test_solutions = pd.read_excel(test_solutions_path)
+    bar = range(len(test_solutions))
+    if use_tqdm:
+        import tqdm
+
+        bar = tqdm.tqdm(bar, desc="Predicting")
+
+    submit_df = pd.DataFrame(columns=["solution_id", "author_comment", "author_comment_embedding"])
+    for i in bar:
+        idx = test_solutions.index[i]
+        solution_row = test_solutions.iloc[i]
+
+        text = predict_func(solution_row)  # here you can do absolute whatever you want
+
+        embedding = embedding2string(get_sentence_embedding(text))
+        submit_df.loc[i] = [idx, text, embedding]
+    submit_df.to_csv(save_path, index=False)
--- a/data/.gitkeep
+++ b/data/.gitkeep
--- a/data/complete/.gitkeep
+++ b/data/complete/.gitkeep
--- a/data/processed/.gitkeep
+++ b/data/processed/.gitkeep
--- a/data/raw/.gitkeep
+++ b/data/raw/.gitkeep
--- a/data/raw/for_teams/submit_example.csv
+++ b/data/raw/for_teams/submit_example.csv
--- a/data/raw/test/.gitkeep
+++ b/data/raw/test/.gitkeep
--- a/data/raw/test/solutions.xlsx
+++ b/data/raw/test/solutions.xlsx
--- a/data/raw/test/tasks.xlsx
+++ b/data/raw/test/tasks.xlsx
--- a/data/raw/test/tests.xlsx
+++ b/data/raw/test/tests.xlsx
--- a/data/raw/train/.gitkeep
+++ b/data/raw/train/.gitkeep
--- a/data/raw/train/solutions.xlsx
+++ b/data/raw/train/solutions.xlsx
--- a/data/raw/train/tasks.xlsx
+++ b/data/raw/train/tasks.xlsx
--- a/data/raw/train/tests.xlsx
+++ b/data/raw/train/tests.xlsx
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@ -0,0 +1,28 @@
+version: "3.8"
+
+services:
+  app:
+    # The image is built from the Dockerfile next to this compose file.
+    build:
+      context: .
+      dockerfile: Dockerfile
+    volumes:
+      # Bind-mount the working tree over /app so the container sees local files.
+      - .:/app
+      # NOTE(review): ./data is already reachable through the mount above —
+      # confirm this second entry is needed.
+      - ./data:/app/data
+    ports:
+      - "8000:8000"
+    environment:
+      - PYTHONUNBUFFERED=1
+    # Same command as the CMD of the Dockerfile.
+    command: poetry run python main.py
+
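+# TODO: to run on GPU, add `- NVIDIA_VISIBLE_DEVICES=all` to the `environment`
+# list of the `app` service above, then uncomment the `deploy` section below by
+# removing only the leading `#` (it is indented to sit inside the `app` service).
+#    deploy:
+#      mode: replicated
+#      replicas: 1
+#      resources:
+#        reservations:
+#          devices:
+#            - driver: nvidia
+#              device_ids: [ '0' ]
+#              capabilities: [ gpu ]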
--- a/main.py
+++ b/main.py
@ -0,0 +1,31 @@
+import os
+from pathlib import Path
+
+import pandas as pd
+from dotenv import load_dotenv
+
+from app.models.yandexgpt import YandexGPT
+from app.utils.submit import generate_submit
+
+if __name__ == "__main__":
+    load_dotenv()
+
+    system_prompt = """
+    Ты - профессиональный программист и ментор. Давай очень короткие ответы о синтаксических ошибках в коде, если они есть.
+    """
+
+    yandex_gpt = YandexGPT(
+        token=os.environ["YANDEX_GPT_IAM_TOKEN"],
+        folder_id=os.environ["YANDEX_GPT_FOLDER_ID"],
+        system_prompt=system_prompt,
+    )
+
+
+    def predict(row: pd.Series) -> str:
+        return yandex_gpt.ask(row["student_solution"])
+
+
+    generate_submit(
+        test_solutions_path="../data/raw/test/solutions.xlsx",
+        predict_func=predict,
+        save_path="../data/processed/submission.csv",
+        use_tqdm=True,
+    )
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,30 @@
+[tool.poetry]
+name = "hse-python-assistant"
+version = "0.1.0"
+description = "Thanks, Beyonce team solution for HSE AI Assistant Hack: Python [https://www.hse.ru/ai-assistant-hack-python/]"
+authors = ["Andrei Anikin <andreyf2357@gmail.com>", "Egor Gorokhov <9143999@gmail.com>", "Iaroslava Vinogradova <mikhailenko.yi@gmail.com>", "Oleg Zakharov <os.zakharov.04@gmail.com>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.11"
+requests = "^2.32.3"
+python-dotenv = "^1.0.1"
+pandas = "^2.2.3"
+scikit-learn = "^1.5.2"
+torch = "^2.4.1"
+transformers = "^4.45.2"
+openpyxl = "^3.1.5"
+accelerate = "^1.0.1"
+
+[tool.poetry.group.dev.dependencies]
+black = { extras = ["jupyter"], version = "^24.10.0" }
+pre-commit = "^4.0.1"
+jupyter = "^1.1.1"
+tqdm = "^4.66.5"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.black]
+line-length = 120
--- a/tests/.gitkeep
+++ b/tests/.gitkeep
--- a/tests/test_correctness.py
+++ b/tests/test_correctness.py
@ -0,0 +1,52 @@
+import pandas as pd
+
+from app.utils.submit import string2embedding
+
+TEST_SIZE = 325
+EMBEDDING_SIZE = 768
+
+
+def _check_ids_correctness(submit_df: pd.DataFrame, submit_example_df: pd.DataFrame) -> bool:
+    not_presented = set(submit_example_df["solution_id"]) - set(submit_df["solution_id"])
+    not_needed = set(submit_df["solution_id"]) - set(submit_example_df["solution_id"])
+
+    not_presented = list(not_presented)
+    not_presented.sort()
+    not_needed = list(not_needed)
+    not_needed.sort()
+
+    error_message = "Submit is incorrect."
+    if len(not_presented) + len(not_needed) > 0:
+        if len(not_presented) > 0:
+            error_message += f" Not presented solution_id: {not_presented}."
+        if len(not_needed) > 0:
+            error_message += f" Not needed solution_id: {not_needed}."
+        raise ValueError(error_message)
+    return True
+
+
+def _check_rows_size_correctness(submit_df: pd.DataFrame) -> bool:
+    incorrect_rows = []
+    for idx in range(TEST_SIZE):
+        if len(string2embedding(submit_df["author_comment_embedding"].iloc[idx])) != EMBEDDING_SIZE:
+            incorrect_rows.append(idx)
+    if len(incorrect_rows) > 0:
+        raise ValueError(f"Submit has incorrect rows: {incorrect_rows}. (incorrect size of embedding)")
+    return True
+
+
+def check_submit_correctness(submit_path: str, submit_example_path: str) -> bool:
+    if not submit_path.endswith(".csv"):
+        raise ValueError(f"{submit_path} is not a .csv file.")
+
+    submit_df = pd.read_csv(submit_path)
+    submit_example_df = pd.read_csv(submit_example_path)
+
+    _check_ids_correctness(submit_df, submit_example_df)
+    _check_rows_size_correctness(submit_df)
+
+    return True
+
+
+# Script entry point: validate the default submit; both paths are relative to
+# the current working directory.
+# NOTE(review): this commit adds data/raw/for_teams/submit_example.csv —
+# confirm that the example path below is the intended one.
+if __name__ == "__main__":
+    check_submit_correctness(submit_path="data/complete/submit.csv", submit_example_path="data/raw/submit_example.csv")
--- a/tests/test_embedding_generation.py
+++ b/tests/test_embedding_generation.py
@ -0,0 +1,17 @@
+from app.utils.metric import cosine_similarity
+from app.utils.submit import get_sentence_embedding
+
+
+def test():
+    sentence1 = "Вы забыли поставить префикс f перед строкой, переданной функции print()."
+    sentence2 = "Вы забыли поставить префикс f перед строкой."
+
+    embedding1 = get_sentence_embedding(sentence1)
+    embedding2 = get_sentence_embedding(sentence2)
+
+    cos_sim = cosine_similarity(embedding1.unsqueeze(0), embedding2.unsqueeze(0))
+    print(f"Cosine Similarity: {cos_sim.item():.4f}")
+
+
+# Allow running this file directly as a script.
+if __name__ == "__main__":
+    test()