Update app.py: load the model with its tokenizer, format the prompt, and pass a generation config

lyndanajjar · lyndanajjar · commit b0120dd07f07 · 2023-11-21T23:55:41.000+01:00
Signed-off-by: lyndanajjar <lyndanajjar15@gmail.com>
diff --git a/src/mpt-7B-inference/app.py b/src/mpt-7B-inference/app.py
@@ -1,31 +1,44 @@
 # app.py
 from flask import Flask, request, jsonify
-from inference import generate, load_model_from_hub
-
-
+from inference import generate, load_model_and_tokenizer, format_prompt
 
 app = Flask(__name__)
 
-# Specify the folder we want to save the downloaded model
-destination_folder = "models"
-
-# Load the mpt-7b-chat model from the Hugging Face Model Hub
+# Load the mpt-7b-chat model and tokenizer from the Hugging Face Model Hub
 model_name = "mosaicml/mpt-7b-chat"
-llm = load_model_from_hub(model_name)
+llm, tokenizer = load_model_and_tokenizer(model_name, trust_remote_code=True)  # Update model loading
 
-
-# Use the model path in the generate function
+# Use the model and tokenizer in the generate function
 @app.route('/predict', methods=['POST'])
 def predict():
-    data = request.json
-    user_prompt = data.get('user_prompt')
+    """Generate the assistant's reply to the ``user_prompt`` in the JSON body.
+
+    Returns HTTP 400 when the body is not a JSON object or ``user_prompt``
+    is not a string.
+    """
+    # silent=True returns None instead of raising on a missing or malformed body
+    data = request.get_json(silent=True)
+    user_prompt = data.get('user_prompt') if isinstance(data, dict) else None
+    if not isinstance(user_prompt, str):
+        return jsonify({'error': 'Request body must be a JSON object with a string "user_prompt" field.'}), 400
 
-# Update the call to use the loaded model directly
-    assistant_response = generate(llm, user_prompt)
+    # Sampling settings handed to generate()
+    generation_config = {
+        "temperature": 0.2,
+        "top_k": 0,
+        "top_p": 0.9,
+        "repetition_penalty": 1.0,
+        "max_new_tokens": 512,
+    }
+
+    # Format the prompt using the system prompt
+    system_prompt = "A conversation between a user and an LLM-based AI assistant named Local Assistant. Local Assistant gives helpful and honest answers."
+    prompt = format_prompt(system_prompt, user_prompt)
+
+    # Generate the assistant's response
+    assistant_response = generate(llm, tokenizer, generation_config, prompt)
 
     return jsonify({'assistant_response': assistant_response})
 
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=5000)
-
-