Amsterdam-AI-Team · kapsterdam · Jun 4, 2025
diff --git a/taalherkenning/README.md b/taalherkenning/README.md
@@ -0,0 +1,36 @@
+# FastText Language Detection Script
+
+This repository contains a simple Python script (`main.py`) that lets a user enter text in the terminal and uses Facebook’s fastText library to detect the language of that text. It prints the detected language code and a confidence score.
+
+---
+
+## Contents
+
+- `main.py`  
+  The main script. It loads (or downloads, if missing) the pre-trained fastText language identification model (`lid.176.ftz`) and accepts user input to identify its language.
+
+- `requirements.txt`  
+  Lists the Python dependencies (`fasttext` and `numpy`) needed for the script.
+
+- `README.md`  
+  Instructions on how to set up and run the script.
+
+---
+
+## Prerequisites
+
+- Python 3.7 or higher
+- Internet connection (only the first time to download the model if it’s not already present)
+
+---
+
+## Setup
+
+1. **Clone or download this repository** to your local machine.
+
+2. **Create and activate a virtual environment** (recommended):
+
+   ```bash
+   python3 -m venv venv
+   source venv/bin/activate       # macOS / Linux
+   # On Windows (cmd.exe), run this instead: venv\Scripts\activate.bat
diff --git a/taalherkenning/main.py b/taalherkenning/main.py
@@ -0,0 +1,84 @@
+import os
+import urllib.request
+import fasttext
+import numpy as np
+
+# File name of the pre-trained fastText language-identification model.
+# It is a relative path, so it is looked up (and downloaded to) the
+# current working directory.
+MODEL_PATH = "lid.176.ftz"
+
+# -------------------------------
+# Patch np.array to ignore copy=False
+# -------------------------------
+
+# Keep a reference to the original np.array so that the replacement
+# function can still delegate to the real implementation after np.array
+# has been rebound.
+_original_np_array = np.array
+
+
+def _patched_np_array(obj, *args, copy=True, **kwargs):
+    """
+    Replacement for np.array that drops the 'copy=False' request
+    so that fastText.predict can wrap probabilities without error.
+    """
+    # Always let np.array decide whether to copy or not (default behavior).
+    if "copy" in kwargs:
+        kwargs.pop("copy")
+    return _original_np_array(obj, *args, **kwargs)
+
+
+# Apply the patch. This rebinds the attribute on the numpy module itself,
+# so from this point on every np.array call in the process (not only the
+# ones made by fastText) goes through _patched_np_array.
+np.array = _patched_np_array
+
+# -------------------------------
+# Model Download & Load
+# -------------------------------
+
+
+def download_model(model_path: str):
+    """
+    Downloads the fastText language identification model if not present.
+    """
+    url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
+    print(
+        f"Downloading language identification model from:\n{url}\n(This may take a moment...)")
+    urllib.request.urlretrieve(url, model_path)
+    print("Download complete.")
+
+
+def load_language_model():
+    """
+    Loads the fastText language identification model, downloading it if necessary.
+    """
+    if not os.path.isfile(MODEL_PATH):
+        download_model(MODEL_PATH)
+    return fasttext.load_model(MODEL_PATH)
+
+# -------------------------------
+# Main Loop
+# -------------------------------
+
+
+def main():
+    model = load_language_model()
+    print("FastText language identification model loaded.\n")
+
+    while True:
+        user_input = input("Enter text (or type 'exit' to quit):\n> ").strip()
+        if user_input.lower() == "exit":
+            print("Exiting.")
+            break
+        if not user_input:
+            print("Please enter some text or 'exit' to quit.\n")
+            continue
+
+        # fastText expects a newline-terminated string for prediction
+        labels, probabilities = model.predict(
+            user_input.replace("\n", " "), k=1)
+        # labels come as '__label__xx'; strip the prefix
+        lang_code = labels[0].replace("__label__", "")
+        confidence = probabilities[0]
+
+        print(
+            f"Detected language: {lang_code} (confidence: {confidence:.4f})\n")
+
+
+# Start the interactive prompt only when this file is executed as a
+# script, not when it is imported as a module.
+if __name__ == "__main__":
+    main()
diff --git a/taalherkenning/requirements.txt b/taalherkenning/requirements.txt
@@ -0,0 +1,2 @@
+fasttext==0.9.2
+numpy>=1.23.0