diff --git a/Dockerfile b/Dockerfile index 28a90d1..d6a2728 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,13 +11,14 @@ RUN pip wheel --wheel-dir=/root/.cache/pip/wheels -r /tmp/requirements.txt FROM python:3.11-slim WORKDIR /app -COPY save_model.py spliting.py requirements.txt ./ +COPY requirements.txt ./ COPY --from=wheel-builder /root/.cache/pip/wheels /root/.cache/pip/wheels RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu RUN pip install --no-index --find-links=/root/.cache/pip/wheels -r requirements.txt +COPY save_model.py ./ RUN python3 save_model.py -COPY app.py ./ +COPY app.py spliting.py model.py merge.py ./ CMD uvicorn app:app --host 0.0.0.0 --port 8080 --log-level warning diff --git a/app.py b/app.py index 7098954..7d08085 100644 --- a/app.py +++ b/app.py @@ -1,32 +1,41 @@ from spliting import split from fastapi import FastAPI -import torch -from transformers import pipeline - +from model import model +from merge import segmentize_and_merge app = FastAPI( title="Модуль разбиения текстов вакансий", - description="Решение представляет собой модуль для разбиения вакансий на должностные обязанности, " - "требования к соискателю, условия труда и примечания. Модуль может с легкостью встраиваться " - "в информационные системы агрегации с целью повышения качества оформления предлагаемых объявлений." + description=( + "Решение представляет собой модуль для разбиения вакансий на должностные обязанности, " + "требования к соискателю, условия труда и примечания. Модуль может с легкостью встраиваться " + "в информационные системы агрегации с целью повышения качества оформления предлагаемых объявлений." 
+ ) ) -device = torch.cuda.current_device() if torch.cuda.is_available() and torch.cuda.mem_get_info()[0] >= 2*1024**3 else -1 -model = pipeline("text-classification", "extractor_model", device=device) - -@app.get("/", description="Принимает на вход текст вакансии, возвращает сгруппированные должностные обязанности, " - "требования к соискателю, условия труда и примечания.") -async def index(text: str) -> dict[str, str]: +@app.get( + "/", + description="Принимает на вход текст вакансии, возвращает сгруппированные должностные обязанности, " + "требования к соискателю, условия труда и примечания." +) +async def index(text: str, merged: bool = False) -> dict[str, str]: result = {"responsibilities": "", "requirements": "", "terms": "", "notes": ""} + text = text.replace("\t", " ").replace("\r", "") - sentences = split(text) - predicts = [predict["label"] for predict in model.predict(sentences)] - for sentence, label in zip(sentences, predicts): + + if merged: + segments = segmentize_and_merge(text) + else: + sentences = split(text) + predicts = [predict["label"] for predict in model.predict(sentences)] + segments = zip(sentences, predicts) + + for sentence, label in segments: result[label.lower()] += sentence + " " for key in result.keys(): result[key] = result[key].strip().replace(" ", " ") + return result diff --git a/merge.py b/merge.py new file mode 100644 index 0000000..96935ee --- /dev/null +++ b/merge.py @@ -0,0 +1,95 @@ +from model import model +from spliting import split + + +def make_segments(text): + return [ + { + 'text': segment, + 'scores': { + predict['label']: predict['score'] + for predict in model(segment, top_k=None) + } + } + for segment in split(text) + ] + + +def merge_segments(segment1, segment2): + segment = segment1['text'] + ' ' + segment2['text'] + return { + 'text': segment, + 'scores': { + predict['label']: predict['score'] + for predict in model(segment, top_k=None) + } + } + + +def merge_by_sim(segments, bottom_threshold, top_threshold): 
+ i = 0 + while i < len(segments) - 1: + conf_1 = max(segments[i]['scores'].values()) + conf_2 = max(segments[i + 1]['scores'].values()) + + try: + merged = merge_segments(segments[i], segments[i + 1]) + except RuntimeError: + i += 1 + continue + + conf_m = max(merged['scores'].values()) + if ( + (conf_m > top_threshold) and + (conf_1 < bottom_threshold or conf_2 < bottom_threshold) + ): + segments[i] = merged + segments.pop(i + 1) + else: + i += 1 + + +def merge_by_same(segments): + i = 0 + while i < len(segments) - 1: + try: + merged = merge_segments(segments[i], segments[i + 1]) + except RuntimeError: + i += 1 + continue + + conf = ( + max(segments[i]['scores'], key=segments[i]['scores'].get) == + max(segments[i + 1]['scores'], key=segments[i + 1]['scores'].get) + ) + if conf: + segments[i] = merged + segments.pop(i + 1) + else: + i += 1 + + +def set_low_as(segments, as_, threshold): + for i in range(len(segments)): + cls = max(segments[i]['scores'], key=segments[i]['scores'].get) + if segments[i]['scores'][cls] < threshold: + segments[i]['scores'][as_] = threshold + 0.1 + + +def segmentize_and_merge(text): + segments = make_segments(text) + + merge_by_sim(segments, 0.93, 0.96) + merge_by_same(segments) + + set_low_as(segments, 'notes', 0.56) + + result = [ + [ + segment['text'], + max(segment['scores'], key=segment['scores'].get) + ] + for segment in segments + ] + + return result diff --git a/model.py b/model.py new file mode 100644 index 0000000..3a6ef4b --- /dev/null +++ b/model.py @@ -0,0 +1,11 @@ +import torch +from transformers import pipeline + + +device = ( + torch.cuda.current_device() + if torch.cuda.is_available() and torch.cuda.mem_get_info()[0] >= 2*1024**3 + else -1 +) + +model = pipeline("text-classification", "extractor_model", device=device)