commit 162885a76d
Dockerfile
@@ -0,0 +1,19 @@
FROM python:3.11-slim

WORKDIR /app

RUN pip install poetry

COPY pyproject.toml poetry.lock* /app/

RUN poetry config virtualenvs.create false && poetry install --no-interaction --no-ansi

COPY . /app

RUN mkdir -p /app/data/complete /app/data/processed /app/data/raw/test /app/data/raw/train

EXPOSE 8000

ENV PYTHONUNBUFFERED=1

CMD ["poetry", "run", "python", "main.py"]
README.md
@@ -0,0 +1,45 @@
# hse-python-assistant

TBD

## Links

Testing system: https://dsworks.ru/champ/hse-2024-october \
Landing page: https://www.hse.ru/ai-assistant-hack-python/

## Running

TBD

## Project structure

```
.
├── app
│   ├── __init__.py
│   ├── models <---------------------------- LLM in gguf format
│   └── utils <----------------------------- utilities
│       ├── __init__.py
│       ├── metric.py <--------------------- get familiar with the metric
│       └── submit.py <--------------------- everything needed to generate a submission
├── data
│   ├── complete <-------------------------- prepared data, submissions
│   ├── processed <------------------------- intermediate stage of data preparation
│   └── raw <------------------------------- source data
│       ├── submit_example.csv
│       ├── test
│       │   ├── solutions.xlsx
│       │   ├── tasks.xlsx
│       │   └── tests.xlsx
│       └── train
│           ├── solutions.xlsx
│           ├── tasks.xlsx
│           └── tests.xlsx
├── main.py <--------------------------------- [IMPORTANT] This is exactly the script we will run when checking your solutions. It must generate the final submission.
├── poetry.lock
├── pyproject.toml
├── README.md
└── tests
    ├── test_correctness.py <----------------- check a submission for correctness
    └── test_embedding_generation.py <-------- try embedding generation and metric computation
```
app/utils/metric.py
@@ -0,0 +1,35 @@
import pandas as pd
from torch.nn.functional import cosine_similarity

from app.utils.submit import string2embedding


def _get_cosine_similarity(pred_df: pd.DataFrame, true_df: pd.DataFrame) -> float:
    predictions = pred_df["author_comment_embedding"]
    true_values = true_df["author_comment_embedding"]
    total_cos_sim = 0

    for idx in range(len(true_values)):
        pred_value = string2embedding(predictions.iloc[idx])
        gt_value = string2embedding(true_values.iloc[idx])

        if len(pred_value) != len(gt_value):
            raise ValueError(f"Embeddings have different sizes: {len(pred_value)} != {len(gt_value)}")

        cos_sim_value = cosine_similarity(pred_value.unsqueeze(0), gt_value.unsqueeze(0))
        total_cos_sim += cos_sim_value
    return float(total_cos_sim / len(true_df))


def calculate_score(submit_path: str, gt_path: str) -> float:
    submit_df = pd.read_csv(submit_path)
    true_df = pd.read_excel(gt_path)
    # Keep only the submitted rows that have a ground-truth counterpart.
    submit_df = submit_df[submit_df["solution_id"].isin(true_df["id"])]
    # Rescale the mean cosine similarity: 0.6 maps to a score of 0, 1.0 maps to 1.
    return (_get_cosine_similarity(submit_df, true_df) - 0.6) / 0.4


def calculate_score_and_save(submit_path: str, gt_path: str, save_path: str) -> float:
    score = calculate_score(submit_path, gt_path)
    with open(save_path, "w") as f:
        f.write(f"{score}")
    return score
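The `(x - 0.6) / 0.4` rescaling in `calculate_score` maps a mean cosine similarity of 0.6 to a leaderboard score of 0 and a perfect 1.0 to a score of 1. A minimal sketch of that relationship, using only torch; the two vectors are made-up illustration data, not project embeddings:

```python
import torch
from torch.nn.functional import cosine_similarity

# Arbitrary stand-ins for a predicted and a ground-truth embedding.
pred = torch.tensor([0.10, 0.20, 0.30]).unsqueeze(0)
gt = torch.tensor([0.12, 0.18, 0.33]).unsqueeze(0)

mean_cos = cosine_similarity(pred, gt).item()  # raw similarity, in [-1, 1]
score = (mean_cos - 0.6) / 0.4                 # same linear rescaling as calculate_score
print(f"cosine similarity: {mean_cos:.4f}, leaderboard score: {score:.4f}")
```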
app/utils/submit.py
@@ -0,0 +1,47 @@
from typing import Callable

import pandas as pd
import torch
from transformers import BertModel, BertTokenizer

print("Loading models...", end="")
model_name = "DeepPavlov/rubert-base-cased-sentence"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
print("OK")


def get_sentence_embedding(sentence: str) -> torch.Tensor:
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the [CLS] token representation as the sentence embedding.
    embedding = outputs.last_hidden_state[:, 0, :].squeeze()
    return embedding


def string2embedding(string: str) -> torch.Tensor:
    return torch.Tensor([float(i) for i in string.split()])


def embedding2string(embedding: torch.Tensor) -> str:
    return " ".join([str(i) for i in embedding.tolist()])


def generate_submit(test_solutions_path: str, predict_func: Callable, save_path: str, use_tqdm: bool = True) -> None:
    test_solutions = pd.read_excel(test_solutions_path)
    bar = range(len(test_solutions))
    if use_tqdm:
        import tqdm

        bar = tqdm.tqdm(bar, desc="Predicting")

    submit_df = pd.DataFrame(columns=["solution_id", "author_comment", "author_comment_embedding"])
    for i in bar:
        idx = test_solutions.index[i]
        solution_row = test_solutions.iloc[i]

        text = predict_func(solution_row)  # here you can do absolutely whatever you want

        embedding = embedding2string(get_sentence_embedding(text))
        submit_df.loc[i] = [idx, text, embedding]
    submit_df.to_csv(save_path, index=False)
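A brief sketch of the round trip behind `embedding2string` / `string2embedding`, which is how embeddings end up stored as space-separated floats in the submission CSV. The helper bodies are inlined here so the example does not trigger the module-level BERT download above; the tensor values are purely illustrative:

```python
import torch

vec = torch.tensor([0.125, -1.5, 3.0])

as_text = " ".join(str(x) for x in vec.tolist())              # mirrors embedding2string
restored = torch.Tensor([float(x) for x in as_text.split()])  # mirrors string2embedding

assert torch.allclose(vec, restored)
print(as_text)  # "0.125 -1.5 3.0"
```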
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
docker-compose.yml
@@ -0,0 +1,28 @@
version: "3.8"

services:
  app:
    build:
      context: .
      dockerfile: Dockerfile
    volumes:
      - .:/app
      - ./data:/app/data
    ports:
      - "8000:8000"
    environment:
      - PYTHONUNBUFFERED=1
    command: poetry run python main.py

    # TODO: uncomment the block below if you want to run on a GPU.
    # Note: fold NVIDIA_VISIBLE_DEVICES into the environment list above
    # rather than adding a second environment key.
    # environment:
    #   - NVIDIA_VISIBLE_DEVICES=all
    # deploy:
    #   mode: replicated
    #   replicas: 1
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           device_ids: [ '0' ]
    #           capabilities: [ gpu ]
main.py
@@ -0,0 +1,31 @@
import os

import pandas as pd
from dotenv import load_dotenv

from app.models.yandexgpt import YandexGPT
from app.utils.submit import generate_submit

if __name__ == "__main__":
    # Load YANDEX_GPT_* credentials from a local .env file.
    load_dotenv()

    # System prompt (in Russian): "You are a professional programmer and mentor.
    # Give very short answers about syntax errors in the code, if there are any."
    system_prompt = """
    Ты - профессиональный программист и ментор. Давай очень короткие ответы о синтаксических ошибках в коде, если они есть.
    """

    yandex_gpt = YandexGPT(
        token=os.environ["YANDEX_GPT_IAM_TOKEN"],
        folder_id=os.environ["YANDEX_GPT_FOLDER_ID"],
        system_prompt=system_prompt,
    )

    def predict(row: pd.Series) -> str:
        return yandex_gpt.ask(row["student_solution"])

    generate_submit(
        test_solutions_path="../data/raw/test/solutions.xlsx",
        predict_func=predict,
        save_path="../data/processed/submission.csv",
        use_tqdm=True,
    )
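`generate_submit` only needs a `predict_func`, so the pipeline can be exercised without YandexGPT credentials. A hedged sketch of a stand-in predictor: the `student_solution` column name is taken from main.py above, while the constant comment text is an illustrative placeholder and the paths assume the repository layout from the README:

```python
import pandas as pd

from app.utils.submit import generate_submit


def offline_predict(row: pd.Series) -> str:
    # Placeholder comment instead of a real LLM call; every row gets the same text.
    return "Проверьте синтаксис решения."


if __name__ == "__main__":
    generate_submit(
        test_solutions_path="data/raw/test/solutions.xlsx",
        predict_func=offline_predict,
        save_path="data/complete/submit.csv",
        use_tqdm=True,
    )
```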
File diff suppressed because it is too large
pyproject.toml
@@ -0,0 +1,30 @@
[tool.poetry]
name = "hse-python-assistant"
version = "0.1.0"
description = "Thanks, Beyonce team solution for HSE AI Assistant Hack: Python [https://www.hse.ru/ai-assistant-hack-python/]"
authors = ["Andrei Anikin <andreyf2357@gmail.com>", "Egor Gorokhov <9143999@gmail.com>", "Iaroslava Vinogradova <mikhailenko.yi@gmail.com>", "Oleg Zakharov <os.zakharov.04@gmail.com>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.11"
requests = "^2.32.3"
python-dotenv = "^1.0.1"
pandas = "^2.2.3"
scikit-learn = "^1.5.2"
torch = "^2.4.1"
transformers = "^4.45.2"
openpyxl = "^3.1.5"
accelerate = "^1.0.1"

[tool.poetry.group.dev.dependencies]
black = { extras = ["jupyter"], version = "^24.10.0" }
pre-commit = "^4.0.1"
jupyter = "^1.1.1"
tqdm = "^4.66.5"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.black]
line-length = 120
tests/test_correctness.py
@@ -0,0 +1,52 @@
import pandas as pd

from app.utils.submit import string2embedding

TEST_SIZE = 325
EMBEDDING_SIZE = 768


def _check_ids_correctness(submit_df: pd.DataFrame, submit_example_df: pd.DataFrame) -> bool:
    # Compare both directions: IDs the grader expects but did not get, and IDs it did not expect.
    not_presented = set(submit_example_df["solution_id"]) - set(submit_df["solution_id"])
    not_needed = set(submit_df["solution_id"]) - set(submit_example_df["solution_id"])

    not_presented = sorted(not_presented)
    not_needed = sorted(not_needed)

    error_message = "Submit is incorrect."
    if len(not_presented) + len(not_needed) > 0:
        if len(not_presented) > 0:
            error_message += f" Missing solution_id values: {not_presented}."
        if len(not_needed) > 0:
            error_message += f" Unexpected solution_id values: {not_needed}."
        raise ValueError(error_message)
    return True


def _check_rows_size_correctness(submit_df: pd.DataFrame) -> bool:
    incorrect_rows = []
    for idx in range(TEST_SIZE):
        if len(string2embedding(submit_df["author_comment_embedding"].iloc[idx])) != EMBEDDING_SIZE:
            incorrect_rows.append(idx)
    if len(incorrect_rows) > 0:
        raise ValueError(f"Submit has incorrect rows: {incorrect_rows} (incorrect embedding size).")
    return True


def check_submit_correctness(submit_path: str, submit_example_path: str) -> bool:
    if not submit_path.endswith(".csv"):
        raise ValueError(f"{submit_path} is not a .csv file.")

    submit_df = pd.read_csv(submit_path)
    submit_example_df = pd.read_csv(submit_example_path)

    _check_ids_correctness(submit_df, submit_example_df)
    _check_rows_size_correctness(submit_df)

    return True


if __name__ == "__main__":
    check_submit_correctness(submit_path="data/complete/submit.csv", submit_example_path="data/raw/submit_example.csv")
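The ID check above hinges on both directions of a set difference: rows the grader expects but did not receive, and rows that do not appear in the example submission. A tiny self-contained illustration with made-up IDs (not real solution_id values):

```python
expected_ids = {101, 102, 103, 104}  # solution_id values from submit_example.csv (illustrative)
submitted_ids = {101, 102, 105}      # solution_id values from the candidate submission

missing = sorted(expected_ids - submitted_ids)     # expected but absent -> [103, 104]
unexpected = sorted(submitted_ids - expected_ids)  # present but not expected -> [105]

print(missing, unexpected)
```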
tests/test_embedding_generation.py
@@ -0,0 +1,17 @@
from app.utils.metric import cosine_similarity
from app.utils.submit import get_sentence_embedding


def test():
    # Two similar review comments in Russian: "You forgot to put the f prefix before the
    # string passed to print()." / "You forgot to put the f prefix before the string."
    sentence1 = "Вы забыли поставить префикс f перед строкой, переданной функции print()."
    sentence2 = "Вы забыли поставить префикс f перед строкой."

    embedding1 = get_sentence_embedding(sentence1)
    embedding2 = get_sentence_embedding(sentence2)

    cos_sim = cosine_similarity(embedding1.unsqueeze(0), embedding2.unsqueeze(0))
    print(f"Cosine Similarity: {cos_sim.item():.4f}")


if __name__ == "__main__":
    test()