diff --git a/label_studio_ml/examples/nemo/asr.py b/label_studio_ml/examples/nemo/asr.py
index 71fbb074d..d213645e2 100644
--- a/label_studio_ml/examples/nemo/asr.py
+++ b/label_studio_ml/examples/nemo/asr.py
@@ -12,7 +12,7 @@ class NemoASR(LabelStudioMLBase):
 
-    def __init__(self, model_name='QuartzNet15x5Base-En', **kwargs):
+    def __init__(self, model_name='stt_hi_conformer_ctc_medium', **kwargs):
         super(NemoASR, self).__init__(**kwargs)
 
         # Find TextArea control tag and bind ASR model to it
diff --git a/label_studio_ml/examples/openai/docker-compose.yml b/label_studio_ml/examples/openai/docker-compose.yml
index c379a3220..728836046 100644
--- a/label_studio_ml/examples/openai/docker-compose.yml
+++ b/label_studio_ml/examples/openai/docker-compose.yml
@@ -1,14 +1,27 @@
 version: "3.8"
 
 services:
-  server:
-    container_name: server
+  label-studio-ml-backend:
     build: .
+    container_name: ls-ml-backend
     environment:
-      - LABEL_STUDIO_ML_BACKEND_V2=true
-      - LOG_LEVEL=DEBUG
-      - OPENAI_API_KEY=
+      - PYTHONUNBUFFERED=1
     ports:
       - "9090:9090"
     volumes:
-      - "./prompt.txt:/app/prompt.txt"
\ No newline at end of file
+      - .:/app
+
+  label-studio:
+    image: heartexlabs/label-studio:latest
+    container_name: label-studio
+    environment:
+      - LABEL_STUDIO_ML_BACKENDS=label-studio-ml-backend:9090
+    ports:
+      - "8080:8080"
+    depends_on:
+      - label-studio-ml-backend
+    volumes:
+      - label-studio-data:/label-studio/data
+
+volumes:
+  label-studio-data:
diff --git a/label_studio_ml/examples/openai/openai_predictor.py b/label_studio_ml/examples/openai/openai_predictor.py
index d6546dd8d..334a7943f 100644
--- a/label_studio_ml/examples/openai/openai_predictor.py
+++ b/label_studio_ml/examples/openai/openai_predictor.py
@@ -1,95 +1,40 @@
-import os
-import openai
-import difflib
-import logging
-
 from label_studio_ml.model import LabelStudioMLBase
+import logging
+from typing import List, Dict, Optional
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
-logger = logging.getLogger(__name__)
-
-openai.api_key = os.environ['OPENAI_API_KEY']
-
-
-class OpenAIPredictor(LabelStudioMLBase):
-    DEFAULT_PROMPT = os.path.join(os.path.dirname(__file__), 'prompt.txt')
 
+class GPTIndicBackend(LabelStudioMLBase):
     def __init__(self, **kwargs):
-        # don't forget to initialize base class...
-        super(OpenAIPredictor, self).__init__(**kwargs)
-
-        # Parsed label config contains only one output of <Choices> type
-        assert len(self.parsed_label_config) == 1
-        self.from_name, self.info = list(self.parsed_label_config.items())[0]
-        assert self.info['type'] == 'Choices'
-
-        # the model has only one textual input
-        assert len(self.info['to_name']) == 1
-        assert len(self.info['inputs']) == 1
-        assert self.info['inputs'][0]['type'] == 'Text'
-        self.to_name = self.info['to_name'][0]
-        self.value = self.info['inputs'][0]['value']
-        self.labels = self.info['labels']
-
-        self.openai_model = kwargs.get('model', 'gpt-3.5-turbo')
-        self.openai_max_tokens = int(kwargs.get('max_tokens', 40))
-        self.openai_temperature = float(kwargs.get('temperature', 0.5))
-        self.openai_prompt = kwargs.get('prompt', self.DEFAULT_PROMPT)
-        if os.path.isfile(self.openai_prompt):
-            with open(self.openai_prompt) as f:
-                self.openai_prompt = f.read()
-
-        logger.debug(
-            f'Initialize OpenAI API with the following parameters:'
-            f' model={self.openai_model}, max_tokens={self.openai_max_tokens}, temperature={self.openai_temperature},'
-            f' prompt={self.openai_prompt}')
-
-    def _get_prompt(self, task_data):
-        if os.path.isfile(self.openai_prompt):
-            # Read the prompt from the file
-            # that allows changing the prompt without restarting the server
-            # use it only for development
-            with open(self.openai_prompt) as f:
-                prompt = f.read()
-        else:
-            prompt = self.openai_prompt
-        return prompt.format(labels=self.labels, **task_data)
-
-    def _get_predicted_label(self, task_data):
-        # Create a prompt for the OpenAI API
-        prompt = self._get_prompt(task_data)
-        # Call OpenAI's API to create a chat completion using the GPT-3 model
-        response = openai.ChatCompletion.create(
-            model=self.openai_model,
-            messages=[
-                {"role": "user", "content": prompt}  # The 'user' role is assigned to the prompt
-            ],
-            max_tokens=self.openai_max_tokens,  # Maximum number of tokens in the response is set to 40
-            n=1,  # We only want one response
-            stop=None,  # There are no specific stop sequences
-            temperature=self.openai_temperature,  # The temperature parameter affects randomness in the output. Lower values (like 0.5) make the output more deterministic.
-        )
-        logger.debug(f'OpenAI response: {response}')
-        # Extract the response text from the ChatCompletion response
-        response_text = response.choices[0].message['content'].strip()
+        # Initialization for the ML backend
+        super(GPTIndicBackend, self).__init__(**kwargs)
 
-        # Extract the matched labels from the response text
-        matched_labels = []
-        for pred in response_text.split("\n"):
-            scores = list(map(lambda l: difflib.SequenceMatcher(None, pred, l).ratio(), self.labels))
-            matched_labels.append(self.labels[scores.index(max(scores))])
-
-        # Return the input_text along with the identified sentiment
-        return matched_labels
+        # Load the pre-trained tokenizer and model from HuggingFace
+        self.tokenizer = AutoTokenizer.from_pretrained("aashay96/indic-gpt")
+        self.model = AutoModelForCausalLM.from_pretrained("aashay96/indic-gpt")
 
     def predict(self, tasks, **kwargs):
         predictions = []
+
         for task in tasks:
-            predicted_labels = self._get_predicted_label(task['data'])
-            result = [{
-                'from_name': self.from_name,
-                'to_name': self.to_name,
-                'type': 'choices',
-                'value': {'choices': predicted_labels}
-            }]
-            predictions.append({'result': result, 'score': 1.0})
+            # Extract the prompt from the task data
+            prompt_text = task['data']['prompt']
+            inputs = self.tokenizer.encode(prompt_text, return_tensors="pt")
+
+            # Generate the response using the model
+            outputs = self.model.generate(inputs, max_length=100)
+            response_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+            # Structure the prediction result
+            predictions.append({
+                'result': [{
+                    'from_name': 'instruction',
+                    'to_name': 'prompt',
+                    'type': 'textarea',
+                    'value': {'text': [response_text[len(prompt_text):]]},
+                }],
+                'score': 1.0  # Confidence score
+            })
+
         return predictions
diff --git a/label_studio_ml/examples/openai/requirements.txt b/label_studio_ml/examples/openai/requirements.txt
index b499e1515..208c745c3 100644
--- a/label_studio_ml/examples/openai/requirements.txt
+++ b/label_studio_ml/examples/openai/requirements.txt
@@ -1,4 +1,5 @@
 gunicorn==20.1.0
 label-studio-ml>=1.0.9
 rq==1.10.1
-openai==0.27.4
\ No newline at end of file
+transformers
+torch
diff --git a/label_studio_ml/examples/segment_anything_model/Dockerfile b/label_studio_ml/examples/segment_anything_model/Dockerfile
index 07e0b5295..bce222bde 100644
--- a/label_studio_ml/examples/segment_anything_model/Dockerfile
+++ b/label_studio_ml/examples/segment_anything_model/Dockerfile
@@ -25,7 +25,7 @@
 
 COPY * /app/
 
-ENV ACCESS_TOKEN=0c5e516d37ed2bc1d11ff5fc59ebaf5e0f756386
+ENV ACCESS_TOKEN=be24dfbee45f8916fc2fee2d6f71da1dc9d5f109
 
 RUN pip install opencv-python
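Note: the sketch below is a minimal, hypothetical driver (not part of this diff) for exercising the new GPTIndicBackend.predict outside Docker. It assumes the label-studio-ml base class accepts a label_config keyword, and that the labeling config defines the <Text name="prompt"> input and <TextArea name="instruction"> output that the backend hard-codes as to_name/from_name. The first run downloads the aashay96/indic-gpt weights from the HuggingFace Hub.

    # smoke_test.py -- hypothetical local check, not included in this PR
    from openai_predictor import GPTIndicBackend

    # Assumed labeling config matching the hard-coded from_name/to_name pair.
    LABEL_CONFIG = """
    <View>
      <Text name="prompt" value="$prompt"/>
      <TextArea name="instruction" toName="prompt"/>
    </View>
    """

    backend = GPTIndicBackend(label_config=LABEL_CONFIG)

    # One task in Label Studio's JSON shape: the text to continue lives under
    # data['prompt'], matching the Text tag's value="$prompt".
    tasks = [{'data': {'prompt': 'भारत एक विशाल देश है'}}]

    predictions = backend.predict(tasks)
    # Each prediction mirrors the result structure built in predict():
    # result[0]['value']['text'][0] holds the generated continuation.
    print(predictions[0]['result'][0]['value']['text'][0])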