dataset cards #3140
Unanswered
chrissny88 asked this question in Q&A
I have just deployed my first model on Hugging Face and it seems to be working well, but I keep encountering an error when I try to view the data in the dataset card (the dataset viewer for my logging dataset). Can anyone help me solve this issue?
Here is my code:
# ✅ Imports
import os
import json
import joblib
import numpy as np
import pandas as pd
from datetime import datetime
from pathlib import Path

import gradio as gr
from huggingface_hub import CommitScheduler

# ✅ Load the trained model safely
MODEL_PATH = "catboost_pipeline_model.joblib"
if os.path.exists(MODEL_PATH):
    viral_load_suppression_predictor = joblib.load(MODEL_PATH)
else:
    raise FileNotFoundError("Error: Model file 'catboost_pipeline_model.joblib' not found!")

# ✅ Define repository directory name based on Hugging Face repo
REPO_DIR = "Viral-load-suppression-mlops-logs"

# ✅ Prepare logging functionality
log_folder = Path("logs/")
log_folder.mkdir(parents=True, exist_ok=True)  # Ensure logs folder exists
log_file = log_folder / "predictions_log.jsonl"  # JSONL format for Hugging Face compatibility

# ✅ Hugging Face dataset commit scheduler
scheduler = CommitScheduler(
    repo_id=REPO_DIR,
    repo_type="dataset",
    folder_path=log_folder,
    path_in_repo="data",
    every=2  # `every` is the number of minutes between pushes, not a prediction count
)

# ✅ Function to count VL switches
def count_switches(viral_loads):
    if len(viral_loads) <= 1 or viral_loads.isnull().all():
        return np.nan if viral_loads.isnull().all() else 0
    states = viral_loads.apply(lambda vl: 'suppressed' if vl < 200 else 'unsuppressed').dropna()
    changes = states != states.shift()
    return max(changes.sum() - 1, 0)

# ✅ Compute Effective Suppression Time
def calc_effective_suppression_time(x):
    start_date = None
    total_suppression_time = pd.Timedelta(days=0)

    for _, row in x.iterrows():
        if row["cbs_Viral_Load"] < 200 and start_date is None:
            start_date = row["cbs_Viral_load_collection_date"]
        elif row["cbs_Viral_Load"] >= 200 and start_date is not None:
            if pd.notnull(row["cbs_Viral_load_collection_date"]) and pd.notnull(start_date):
                total_suppression_time += row["cbs_Viral_load_collection_date"] - start_date
            start_date = None

    if total_suppression_time.days == 0 and (x["cbs_Viral_Load"] < 200).all():
        total_suppression_time = (x["cbs_Viral_load_collection_date"].iloc[-1] - x["cbs_Viral_load_collection_date"].iloc[0])

    return total_suppression_time.days

# ✅ Process Patient History
def process_patient_history(patient_id, history_df, arv_start_date):
    print(f"📌 Retrieving history for patient: {patient_id}")

    patient_history = history_df[history_df["cbs_Index_UPID"] == patient_id].copy()  # copy to avoid SettingWithCopyWarning
    if patient_history.empty:
        print("❌ Patient does not exist in the dataset.")
        return None

    patient_history["cbs_Viral_load_collection_date"] = pd.to_datetime(patient_history["cbs_Viral_load_collection_date"], errors="coerce")
    patient_history = patient_history.sort_values("cbs_Viral_load_collection_date")

    switches = count_switches(patient_history["cbs_Viral_Load"])
    suppressed_counts = (patient_history["cbs_Viral_Load"] < 200).sum()
    unsuppressed_counts = (patient_history["cbs_Viral_Load"] >= 200).sum()

    first_suppression = patient_history[patient_history["cbs_Viral_Load"] < 200]["cbs_Viral_load_collection_date"].min()

    if pd.notnull(first_suppression) and pd.notnull(arv_start_date):
        time_to_first_suppression = (first_suppression - arv_start_date).days
        if time_to_first_suppression < 0:
            time_to_first_suppression = np.nan
    else:
        time_to_first_suppression = np.nan

    effective_suppression_time = calc_effective_suppression_time(patient_history)

    return {
        "VL_switches": switches,
        "effective_suppression_time": effective_suppression_time,
        "suppressed_counts": suppressed_counts,
        "unsuppressed_counts": unsuppressed_counts,
        "Time_to_First_Suppression": time_to_first_suppression
    }

# ✅ Define the dataset logging path inside the repo
repo_dataset_path = Path("dataset")
repo_dataset_path.mkdir(parents=True, exist_ok=True)  # Ensure dataset folder exists

log_file_path = repo_dataset_path / "train.jsonl"

# ✅ Function to log and push predictions to Hugging Face Dataset
def log_prediction(data, prediction):
    """
    Logs the prediction data to a JSONL file and commits it to the Hugging Face Dataset.
    """

    # ✅ Convert NaN values to None before logging (JSON does not support NaN)
    sanitized_data = {
        k: (v if not isinstance(v, float) or not np.isnan(v) else None) for k, v in data.items()
    }

    log_entry = {
        "timestamp": datetime.now().isoformat(),
        **sanitized_data,
        "Prediction": prediction
    }

    try:
        # ✅ Ensure the data is JSON serializable
        json_entry = json.dumps(log_entry)
    except TypeError as e:
        print(f"❌ Error converting data to JSON: {e}")
        return

    # ✅ Append log entry to JSONL file
    try:
        with open(log_file, "a") as f:
            f.write(json_entry + "\n")  # Ensure new line for each entry
        print(f"✅ Prediction logged successfully in {log_file}")
    except Exception as e:
        print(f"❌ Error writing to log file: {e}")
        return

    # ✅ The CommitScheduler already pushes the logs folder in the background
    # every `every` minutes; it has no commit() method, so trigger() is used
    # here to force an immediate push.
    try:
        scheduler.trigger()
    except Exception as e:
        print(f"❌ Error committing log to Hugging Face Dataset: {e}")


# ✅ Predict Viral Load Suppression
def predict_viral_load(cbs_Index_UPID, cbs_Date_debut_ARVs, cbs_gender, Province, cbs_stable, cbs_stable_category, cbs_Client_TPT_outcome):
    print(f"📌 Received input: {cbs_Index_UPID}, {cbs_Date_debut_ARVs}, {cbs_gender}, {Province}, {cbs_stable}, {cbs_stable_category}, {cbs_Client_TPT_outcome}")

    DATA_PATH = "processed_cbs_data.csv"
    if not os.path.exists(DATA_PATH):
        print("❌ Patient history file missing.")
        return -1

    history_df = pd.read_csv(DATA_PATH, usecols=["cbs_Index_UPID", "cbs_Viral_Load", "cbs_Viral_load_collection_date"])
    print(f"✅ Loaded dataset, shape: {history_df.shape}")

    try:
        art_start_date = pd.to_datetime(cbs_Date_debut_ARVs, errors='coerce')
        if pd.notnull(art_start_date):
            total_time_on_ART = (datetime.now() - art_start_date).days  # ✅ Fixed Here
        else:
            total_time_on_ART = 0
    except Exception as e:
        print(f"❌ Error calculating total_time_on_ART: {e}")
        total_time_on_ART = 0

    print(f"✅ Computed total_time_on_ART: {total_time_on_ART}")

    patient_features = process_patient_history(cbs_Index_UPID, history_df, art_start_date)
    if patient_features is None:
        return -1

    print("\n🔎 Calculated Features:")
    print(f"🟢 VL Switches: {patient_features['VL_switches']}")
    print(f"🟢 Effective Suppression Time: {patient_features['effective_suppression_time']} days")
    print(f"🟢 Suppressed Counts: {patient_features['suppressed_counts']}")
    print(f"🟢 Unsuppressed Counts: {patient_features['unsuppressed_counts']}")
    print(f"🟢 Time to First Suppression: {patient_features['Time_to_First_Suppression']} days")
    print(f"🟢 Total Time on ART: {total_time_on_ART} days\n")

    sample = {
        'cbs_stable': cbs_stable,
        'cbs_stable_category': cbs_stable_category,
        'cbs_Client_TPT_outcome': cbs_Client_TPT_outcome,
        'Province': Province,
        'cbs_gender': cbs_gender,
        'VL_switches': patient_features["VL_switches"],
        'effective_suppression_time': patient_features["effective_suppression_time"],
        'suppressed_counts': patient_features["suppressed_counts"],
        'unsuppressed_counts': patient_features["unsuppressed_counts"],
        'Time_to_First_Suppression': patient_features["Time_to_First_Suppression"],
        'total_time_on_ART': total_time_on_ART
    }

    data_point = pd.DataFrame([sample])
    data_point.fillna(0, inplace=True)

    print(f"✅ Prepared input for model: {data_point}")

    try:
        prediction = viral_load_suppression_predictor.predict(data_point).tolist()[0]
        print(f"✅ Prediction made: {prediction}")
    except Exception as e:
        print(f"❌ Model prediction failed: {e}")
        return -1

    return prediction

# ✅ Gradio Interface
demo = gr.Interface(
    fn=predict_viral_load,
    inputs=[
        gr.Textbox(label="Index UPID"),
        gr.Textbox(label="ART Start Date (YYYY-MM-DD)", placeholder="YYYY-MM-DD"),
        gr.Radio(["male", "female"], label="Gender"),
        gr.Dropdown(["east", "west", "kigali city", "south", "north"], label="Province"),
        gr.Radio([0, 1], label="Stable"),
        gr.Dropdown(["stable_at_3_months", "stable_at_6_months"], label="Stable Category"),
        gr.Dropdown(["completed tpt", "therapy in progress"], label="Client TPT Outcome")
    ],
    outputs=gr.Number(label="Prediction (0 = Suppressed, 1 = Unsuppressed, -1 = Patient Not Found)"),
    title="Viral Load Suppression Prediction",
    description="Predict the Viral Load Suppression based on social demographic and health information.",
    allow_flagging="auto",
    concurrency_limit=8
)

demo.queue()
demo.launch(share=True, debug=True)
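For reference, the logging part of the app follows the CommitScheduler pattern; a minimal sketch of that pattern as shown in the huggingface_hub documentation looks roughly like this (repo and file names are illustrative, and `every` is expressed in minutes):

# Minimal CommitScheduler logging sketch (illustrative names): the scheduler
# pushes folder_path to the Hub every `every` minutes in a background thread,
# and writes happen under scheduler.lock so a push never uploads a half-written line.
import json
from datetime import datetime
from pathlib import Path
from huggingface_hub import CommitScheduler

log_folder = Path("logs")
log_folder.mkdir(parents=True, exist_ok=True)
log_file = log_folder / "predictions_log.jsonl"

scheduler = CommitScheduler(
    repo_id="your-username/Viral-load-suppression-mlops-logs",  # full repo id (illustrative)
    repo_type="dataset",
    folder_path=log_folder,
    path_in_repo="data",
    every=5,  # minutes between background pushes
)

def log_prediction(data: dict, prediction) -> None:
    entry = {"timestamp": datetime.now().isoformat(), **data, "Prediction": prediction}
    with scheduler.lock:  # guard the file while the scheduler may be uploading it
        with log_file.open("a") as f:
            f.write(json.dumps(entry) + "\n")  # one JSON object per line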
The error shown by the dataset viewer:
Error code: FeaturesError
Exception: ArrowInvalid
Message: JSON parse error: Invalid value. in row 0
Traceback: Traceback (most recent call last):
File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/packaged_modules/json/json.py", line 160, in _generate_tables
df = pandas_read_json(f)
File "/src/services/worker/.venv/lib/python3.9/site-packages/datasets/packaged_modules/json/json.py", line 38, in pandas_read_json
return pd.read_json(path_or_buf, **kwargs)
File "/src/services/worker/.venv/lib/python3.9/site-packages/pandas/io/json/_json.py", line 815, in read_json
return json_reader.read()
File "/src/services/worker/.venv/lib/python3.9/site-packages/pandas/io/json/_json.py", line 1025, in read
obj = self._get_object_parser(self.data)
File "/src/services/worker/.venv/lib/python3.9/site-packages/pandas/io/json/_json.py", line 1051, in _get_object_parser
obj = FrameParser(json, **kwargs).parse()
File "/src/services/worker/.venv/lib/python3.9/site-packages/pandas/io/json/_json.py", line 1187, in parse
self._parse()
File "/src/services/worker/.venv/lib/python3.9/site-packages/pandas/io/json/_json.py", line 1403, in _parse
ujson_loads(json, precise_float=self.precise_float), dtype=None
ValueError: Expected object or value
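An "ArrowInvalid: JSON parse error: Invalid value. in row 0" from the dataset viewer usually means the first line of the uploaded .jsonl file is not a valid JSON object (for example an empty line, a truncated write, or a non-JSONL file in the data/ folder). A minimal way to check is to download the committed file and parse it line by line; the repo id and filename below are assumptions based on the code above:

# Validate each line of the committed JSONL file to find the row the viewer chokes on.
import json
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="your-username/Viral-load-suppression-mlops-logs",  # illustrative
    repo_type="dataset",
    filename="data/predictions_log.jsonl",  # illustrative, matches path_in_repo above
)

with open(path) as f:
    for i, line in enumerate(f):
        line = line.strip()
        if not line:
            print(f"Row {i}: empty line")
            continue
        try:
            json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Row {i}: invalid JSON -> {e}: {line[:200]}")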