dataset cards #3140
Unanswered
chrissny88 asked this question in Q&A
I have just deployed my first model on Hugging Face and it seems to be working well, but I keep encountering an error when trying to check the data in the dataset card. Can anyone help me solve this issue?
Here is my code:
```python
import json
import os
from datetime import datetime
from pathlib import Path

import gradio as gr
import joblib
import numpy as np
import pandas as pd
from huggingface_hub import CommitScheduler

# ✅ Load the trained model safely
MODEL_PATH = "catboost_pipeline_model.joblib"
if os.path.exists(MODEL_PATH):
    viral_load_suppression_predictor = joblib.load(MODEL_PATH)
else:
    raise FileNotFoundError("Error: Model file 'catboost_pipeline_model.joblib' not found!")

# ✅ Define the Hugging Face dataset repo used for logs
REPO_DIR = "Viral-load-suppression-mlops-logs"

# ✅ Prepare logging functionality
log_folder = Path("logs/")
log_folder.mkdir(parents=True, exist_ok=True)  # Ensure logs folder exists
log_file = log_folder / "predictions_log.jsonl"  # JSONL format for Hugging Face compatibility

# ✅ Hugging Face dataset commit scheduler
scheduler = CommitScheduler(
    repo_id=REPO_DIR,
    repo_type="dataset",
    folder_path=log_folder,
    path_in_repo="data",
    every=2  # pushes the folder every 2 minutes
)

# ✅ Function to count VL switches (transitions between suppressed and unsuppressed)
def count_switches(viral_loads):
    if len(viral_loads) <= 1 or viral_loads.isnull().all():
        return np.nan if viral_loads.isnull().all() else 0
    # Drop missing values before classifying, otherwise NaN would be counted as unsuppressed
    states = viral_loads.dropna().apply(lambda vl: 'suppressed' if vl < 200 else 'unsuppressed')
    changes = states != states.shift()
    return max(changes.sum() - 1, 0)

# ✅ Compute Effective Suppression Time
def calc_effective_suppression_time(x):
    start_date = None
    total_suppression_time = pd.Timedelta(days=0)

    for _, row in x.iterrows():
        if row["cbs_Viral_Load"] < 200 and start_date is None:
            start_date = row["cbs_Viral_load_collection_date"]
        elif row["cbs_Viral_Load"] >= 200 and start_date is not None:
            if pd.notnull(row["cbs_Viral_load_collection_date"]) and pd.notnull(start_date):
                total_suppression_time += row["cbs_Viral_load_collection_date"] - start_date
            start_date = None

    if total_suppression_time.days == 0 and (x["cbs_Viral_Load"] < 200).all():
        total_suppression_time = (x["cbs_Viral_load_collection_date"].iloc[-1] - x["cbs_Viral_load_collection_date"].iloc[0])

    return total_suppression_time.days

# ✅ Process Patient History
def process_patient_history(patient_id, history_df, arv_start_date):
    print(f"📌 Retrieving history for patient: {patient_id}")

    # .copy() so the date conversion below does not write to a slice of history_df
    patient_history = history_df[history_df["cbs_Index_UPID"] == patient_id].copy()
    if patient_history.empty:
        print("❌ Patient does not exist in the dataset.")
        return None

    patient_history["cbs_Viral_load_collection_date"] = pd.to_datetime(patient_history["cbs_Viral_load_collection_date"], errors="coerce")
    patient_history = patient_history.sort_values("cbs_Viral_load_collection_date")

    switches = count_switches(patient_history["cbs_Viral_Load"])
    suppressed_counts = (patient_history["cbs_Viral_Load"] < 200).sum()
    unsuppressed_counts = (patient_history["cbs_Viral_Load"] >= 200).sum()

    first_suppression = patient_history[patient_history["cbs_Viral_Load"] < 200]["cbs_Viral_load_collection_date"].min()

    if pd.notnull(first_suppression) and pd.notnull(arv_start_date):
        time_to_first_suppression = (first_suppression - arv_start_date).days
        if time_to_first_suppression < 0:
            time_to_first_suppression = np.nan
    else:
        time_to_first_suppression = np.nan

    effective_suppression_time = calc_effective_suppression_time(patient_history)

    return {
        "VL_switches": switches,
        "effective_suppression_time": effective_suppression_time,
        "suppressed_counts": suppressed_counts,
        "unsuppressed_counts": unsuppressed_counts,
        "Time_to_First_Suppression": time_to_first_suppression
    }

# ✅ Define the dataset logging path inside the repo
# (Note: nothing below writes to this folder; the scheduler watches log_folder ("logs/") instead.)
repo_dataset_path = Path("dataset")
repo_dataset_path.mkdir(parents=True, exist_ok=True)  # Ensure dataset folder exists

log_file_path = repo_dataset_path / "train.jsonl"

# ✅ Function to log and push predictions to the Hugging Face dataset repo
def log_prediction(data, prediction):
    """
    Logs the prediction data to a JSONL file and commits it to the Hugging Face dataset repo.
    """

    # ✅ Convert NaN values to None before logging (JSON does not support NaN)
    sanitized_data = {
        k: (v if not isinstance(v, float) or not np.isnan(v) else None) for k, v in data.items()
    }

    log_entry = {
        "timestamp": datetime.now().isoformat(),
        **sanitized_data,
        "Prediction": prediction
    }

    try:
        # ✅ Ensure the data is JSON serializable
        json_entry = json.dumps(log_entry)
    except TypeError as e:
        print(f"❌ Error converting data to JSON: {e}")
        return

    # ✅ Append log entry to JSONL file
    try:
        with open(log_file, "a") as f:
            f.write(json_entry + "\n")  # Ensure a new line for each entry
        print(f"✅ Prediction logged successfully in {log_file}")
    except Exception as e:
        print(f"❌ Error writing to log file: {e}")
        return

    # ✅ Ask the scheduler to push the log folder as soon as possible
    # (it also pushes automatically every `every` minutes in the background)
    try:
        scheduler.trigger()
    except Exception as e:
        print(f"❌ Error committing log to Hugging Face Dataset: {e}")


# ✅ Predict Viral Load Suppression
def predict_viral_load(cbs_Index_UPID, cbs_Date_debut_ARVs, cbs_gender, Province, cbs_stable, cbs_stable_category, cbs_Client_TPT_outcome):
    print(f"📌 Received input: {cbs_Index_UPID}, {cbs_Date_debut_ARVs}, {cbs_gender}, {Province}, {cbs_stable}, {cbs_stable_category}, {cbs_Client_TPT_outcome}")

    DATA_PATH = "processed_cbs_data.csv"
    if not os.path.exists(DATA_PATH):
        print("❌ Patient history file missing.")
        return -1

    history_df = pd.read_csv(DATA_PATH, usecols=["cbs_Index_UPID", "cbs_Viral_Load", "cbs_Viral_load_collection_date"])
    print(f"✅ Loaded dataset, shape: {history_df.shape}")

    try:
        art_start_date = pd.to_datetime(cbs_Date_debut_ARVs, errors='coerce')
        if pd.notnull(art_start_date):
            total_time_on_ART = (datetime.now() - art_start_date).days  # ✅ Fixed Here
        else:
            total_time_on_ART = 0
    except Exception as e:
        print(f"❌ Error calculating total_time_on_ART: {e}")
        total_time_on_ART = 0

    print(f"✅ Computed total_time_on_ART: {total_time_on_ART}")

    patient_features = process_patient_history(cbs_Index_UPID, history_df, art_start_date)
    if patient_features is None:
        return -1

    print("\n🔎 Calculated Features:")
    print(f"🟢 VL Switches: {patient_features['VL_switches']}")
    print(f"🟢 Effective Suppression Time: {patient_features['effective_suppression_time']} days")
    print(f"🟢 Suppressed Counts: {patient_features['suppressed_counts']}")
    print(f"🟢 Unsuppressed Counts: {patient_features['unsuppressed_counts']}")
    print(f"🟢 Time to First Suppression: {patient_features['Time_to_First_Suppression']} days")
    print(f"🟢 Total Time on ART: {total_time_on_ART} days\n")

    sample = {
        'cbs_stable': cbs_stable,
        'cbs_stable_category': cbs_stable_category,
        'cbs_Client_TPT_outcome': cbs_Client_TPT_outcome,
        'Province': Province,
        'cbs_gender': cbs_gender,
        'VL_switches': patient_features["VL_switches"],
        'effective_suppression_time': patient_features["effective_suppression_time"],
        'suppressed_counts': patient_features["suppressed_counts"],
        'unsuppressed_counts': patient_features["unsuppressed_counts"],
        'Time_to_First_Suppression': patient_features["Time_to_First_Suppression"],
        'total_time_on_ART': total_time_on_ART
    }

    data_point = pd.DataFrame([sample])
    data_point.fillna(0, inplace=True)

    print(f"✅ Prepared input for model: {data_point}")

    try:
        prediction = viral_load_suppression_predictor.predict(data_point).tolist()[0]
        print(f"✅ Prediction made: {prediction}")
    except Exception as e:
        print(f"❌ Model prediction failed: {e}")
        return -1

    return prediction

# ✅ Gradio Interface
demo = gr.Interface(
    fn=predict_viral_load,
    inputs=[
        gr.Textbox(label="Index UPID"),
        gr.Textbox(label="ART Start Date (YYYY-MM-DD)", placeholder="YYYY-MM-DD"),
        gr.Radio(["male", "female"], label="Gender"),
        gr.Dropdown(["east", "west", "kigali city", "south", "north"], label="Province"),
        gr.Radio([0, 1], label="Stable"),
        gr.Dropdown(["stable_at_3_months", "stable_at_6_months"], label="Stable Category"),
        gr.Dropdown(["completed tpt", "therapy in progress"], label="Client TPT Outcome")
    ],
    outputs=gr.Number(label="Prediction (0 = Suppressed, 1 = Unsuppressed, -1 = Patient Not Found)"),
    title="Viral Load Suppression Prediction",
    description="Predict Viral Load Suppression based on sociodemographic and health information.",
    allow_flagging="auto",
    concurrency_limit=8
)

demo.queue()
demo.launch(share=True, debug=True)
```
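
Side note on the logging step: the huggingface_hub docs show writes to a file watched by `CommitScheduler` being wrapped in `scheduler.lock`, so a scheduled push never uploads a half-written line. Below is a minimal sketch of that pattern reusing the `scheduler` and `log_file` names from my script above (just an illustration, not a verified fix for the error):

```python
import json
from datetime import datetime

def append_log_entry(entry: dict) -> None:
    # Hold the scheduler's lock so a background push never sees a partially written line
    with scheduler.lock:  # `scheduler` and `log_file` come from the script above
        with log_file.open("a") as f:
            f.write(json.dumps(entry) + "\n")

# Example usage:
# append_log_entry({"timestamp": datetime.now().isoformat(), "Prediction": 0})
```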
Here is the error shown on the dataset page:
```
Error code: FeaturesError
Exception: ArrowInvalid
Message: JSON parse error: Invalid value. in row 0
Traceback: Traceback (most recent call last):
  File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/packaged_modules/json/json.py", line 160, in _generate_tables
    df = pandas_read_json(f)
  File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/packaged_modules/json/json.py", line 38, in pandas_read_json
    return pd.read_json(path_or_buf, **kwargs)
  File "/src/services/worker/.venv/lib/python3.9/site-packages/pandas/io/json/_json.py", line 815, in read_json
    return json_reader.read()
  File "/src/services/worker/.venv/lib/python3.9/site-packages/pandas/io/json/_json.py", line 1025, in read
    obj = self._get_object_parser(self.data)
  File "/src/services/worker/.venv/lib/python3.9/site-packages/pandas/io/json/_json.py", line 1051, in _get_object_parser
    obj = FrameParser(json, **kwargs).parse()
  File "/src/services/worker/.venv/lib/python3.9/site-packages/pandas/io/json/_json.py", line 1187, in parse
    self._parse()
  File "/src/services/worker/.venv/lib/python3.9/site-packages/pandas/io/json/_json.py", line 1403, in _parse
    ujson_loads(json, precise_float=self.precise_float), dtype=None
ValueError: Expected object or value
```
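
In case it helps narrow things down, here is a minimal sketch of how the local log file could be checked line by line for invalid JSON before it gets pushed (the `logs/predictions_log.jsonl` path is the one assumed from my script above):

```python
import json
from pathlib import Path

# Report every line of the local log file that is not valid JSON
log_path = Path("logs/predictions_log.jsonl")  # path assumed from the script above

for lineno, line in enumerate(log_path.read_text().splitlines(), start=1):
    if not line.strip():
        print(f"line {lineno}: empty line")
        continue
    try:
        json.loads(line)
    except json.JSONDecodeError as err:
        print(f"line {lineno}: invalid JSON -> {err}")
```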