Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@ RUN pip wheel --wheel-dir=/root/.cache/pip/wheels -r /tmp/requirements.txt
FROM python:3.11-slim
WORKDIR /app

COPY save_model.py spliting.py requirements.txt ./
COPY requirements.txt ./
COPY --from=wheel-builder /root/.cache/pip/wheels /root/.cache/pip/wheels
RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
RUN pip install --no-index --find-links=/root/.cache/pip/wheels -r requirements.txt

COPY save_model.py ./
RUN python3 save_model.py

COPY app.py ./
COPY app.py spliting.py model.py merge.py ./

CMD uvicorn app:app --host 0.0.0.0 --port 8080 --log-level warning
40 changes: 25 additions & 15 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,42 @@
from spliting import split
from fastapi import FastAPI
import torch
from transformers import pipeline

from model import model
from spliting import split
from merge import segmentize_and_merge

app = FastAPI(
title="Модуль разбиения текстов вакансий",
description="Решение представляет собой модуль для разбиения вакансий на должностные обязанности, "
"требования к соискателю, условия труда и примечания. Модуль может с легкостью встраиваться "
"в информационные системы агрегации с целью повышения качества оформления предлагаемых объявлений."
description=(
"Решение представляет собой модуль для разбиения вакансий на должностные обязанности, "
"требования к соискателю, условия труда и примечания. Модуль может с легкостью встраиваться "
"в информационные системы агрегации с целью повышения качества оформления предлагаемых объявлений."
)
)

device = torch.cuda.current_device() if torch.cuda.is_available() and torch.cuda.mem_get_info()[0] >= 2*1024**3 else -1
model = pipeline("text-classification", "extractor_model", device=device)


@app.get(
    "/",
    description="Принимает на вход текст вакансии, возвращает сгруппированные должностные обязанности, "
                "требования к соискателю, условия труда и примечания."
)
async def index(text: str, merged: bool = False) -> dict[str, str]:
    """Classify a vacancy text into grouped sections.

    Args:
        text: Raw vacancy text.
        merged: When True, use the merge-based segmentation
            (``segmentize_and_merge``) instead of per-sentence
            classification.

    Returns:
        Dict with keys ``responsibilities``, ``requirements``, ``terms``
        and ``notes`` mapping to the concatenated sentences of each group.
    """
    result = {"responsibilities": "",
              "requirements": "",
              "terms": "",
              "notes": ""}

    # Normalise whitespace before sentence splitting.
    text = text.replace("\t", " ").replace("\r", "")

    if merged:
        segments = segmentize_and_merge(text)
    else:
        sentences = split(text)
        predicts = [predict["label"] for predict in model.predict(sentences)]
        segments = zip(sentences, predicts)

    for sentence, label in segments:
        result[label.lower()] += sentence + " "
    for key in result:
        # Collapse double spaces left by concatenation.  The previous
        # `.replace(" ", " ")` was a no-op (same search and replacement).
        result[key] = result[key].strip().replace("  ", " ")

    return result
95 changes: 95 additions & 0 deletions merge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from model import model
from spliting import split


def make_segments(text):
    """Split *text* into sentences and score each one with the classifier.

    Returns a list of ``{'text': str, 'scores': {label: score}}`` dicts,
    one per sentence produced by ``split``.
    """
    segments = []
    for sentence in split(text):
        # top_k=None asks the pipeline for scores over all labels.
        scores = {
            predict['label']: predict['score']
            for predict in model(sentence, top_k=None)
        }
        segments.append({'text': sentence, 'scores': scores})
    return segments


def merge_segments(segment1, segment2):
    """Concatenate two segments and re-score the merged text.

    Returns a new ``{'text': ..., 'scores': ...}`` dict; the inputs are
    not modified.
    """
    merged_text = ' '.join((segment1['text'], segment2['text']))
    merged_scores = {
        predict['label']: predict['score']
        for predict in model(merged_text, top_k=None)
    }
    return {'text': merged_text, 'scores': merged_scores}


def merge_by_sim(segments, bottom_threshold, top_threshold):
    """Greedily merge adjacent segments in place by classifier confidence.

    A neighbouring pair is replaced by its merged segment when the merged
    text's top score exceeds *top_threshold* while at least one of the
    originals scored below *bottom_threshold*.

    Args:
        segments: List of ``{'text', 'scores'}`` dicts; mutated in place.
        bottom_threshold: Confidence below which a segment is "weak".
        top_threshold: Confidence the merged segment must exceed.

    Returns:
        None.  *segments* is modified in place.
    """
    i = 0
    # `<` instead of `!=`: with an empty list, len(segments) - 1 == -1 and
    # the original `i != len(segments) - 1` loop entered and raised
    # IndexError on segments[0].
    while i < len(segments) - 1:
        conf_1 = max(segments[i]['scores'].values())
        conf_2 = max(segments[i + 1]['scores'].values())

        try:
            merged = merge_segments(segments[i], segments[i + 1])
        except RuntimeError:
            # Model failed on the merged text (e.g. input too long) —
            # leave this pair unmerged and move on.
            i += 1
            continue

        conf_m = max(merged['scores'].values())
        if (
            (conf_m > top_threshold) and
            (conf_1 < bottom_threshold or conf_2 < bottom_threshold)
        ):
            # Collapse the pair and re-examine position i against the
            # following segment on the next iteration.
            segments[i] = merged
            segments.pop(i + 1)
        else:
            i += 1


def merge_by_same(segments):
    """Merge adjacent segments whose top-scoring labels agree, in place.

    Args:
        segments: List of ``{'text', 'scores'}`` dicts; mutated in place.

    Returns:
        None.  *segments* is modified in place.
    """
    i = 0
    # `<` instead of `!=`: with an empty list, len(segments) - 1 == -1 and
    # the original `i != len(segments) - 1` loop entered and raised
    # IndexError on segments[0].
    while i < len(segments) - 1:
        try:
            merged = merge_segments(segments[i], segments[i + 1])
        except RuntimeError:
            # Model failed on the merged text — skip this pair.
            i += 1
            continue

        # Renamed from `conf`: this is a label-equality flag, not a
        # confidence value.
        same_label = (
            max(segments[i]['scores'], key=segments[i]['scores'].get) ==
            max(segments[i + 1]['scores'], key=segments[i + 1]['scores'].get)
        )
        if same_label:
            segments[i] = merged
            segments.pop(i + 1)
        else:
            i += 1


def set_low_as(segments, as_, threshold):
    """Force low-confidence segments toward the *as_* label, in place.

    For every segment whose best score is below *threshold*, write a
    score of ``threshold + 0.1`` for label *as_* so it wins the argmax.
    """
    for segment in segments:
        best_label = max(segment['scores'], key=segment['scores'].get)
        if segment['scores'][best_label] < threshold:
            segment['scores'][as_] = threshold + 0.1


def segmentize_and_merge(text):
    """Split *text* into segments, merge related neighbours, label each.

    Pipeline: sentence split + scoring, confidence-based merge,
    same-label merge, then low-confidence segments are forced to
    'notes'.  Thresholds are tuned constants.

    Returns:
        List of ``[segment_text, label]`` pairs.
    """
    segments = make_segments(text)

    merge_by_sim(segments, 0.93, 0.96)
    merge_by_same(segments)

    set_low_as(segments, 'notes', 0.56)

    labelled = []
    for segment in segments:
        best_label = max(segment['scores'], key=segment['scores'].get)
        labelled.append([segment['text'], best_label])
    return labelled
11 changes: 11 additions & 0 deletions model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import torch
from transformers import pipeline


# Use the current CUDA device only when a GPU is present with at least
# 2 GiB free; otherwise fall back to CPU (device index -1).
if torch.cuda.is_available() and torch.cuda.mem_get_info()[0] >= 2 * 1024 ** 3:
    device = torch.cuda.current_device()
else:
    device = -1

# Shared text-classification pipeline loaded from the local
# "extractor_model" directory; imported by app.py and merge.py.
model = pipeline("text-classification", "extractor_model", device=device)