Skip to content

Commit 4229a67

Browse files
Merge pull request #1 from Sedimark/dev
update data formatter component
2 parents 77ca9d7 + e506f2a commit 4229a67

File tree

1 file changed

+46
-76
lines changed

1 file changed

+46
-76
lines changed

InteroperabilityEnabler/utils/data_formatter.py

Lines changed: 46 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -12,45 +12,41 @@
1212
import pandas as pd
1313

1414

def data_to_dataframe(data):
    """
    Convert a data file or an in-memory JSON/JSON-LD object into a pandas DataFrame.

    Spreadsheet and CSV files are loaded as-is by pandas. JSON and JSON-LD
    content (from a file or passed directly) is flattened entity by entity
    with ``flatten_dict`` so that every entity becomes one row.

    Args:
        data (str | os.PathLike | dict | list): Path to a ``.xls``, ``.xlsx``,
            ``.csv``, ``.json`` or ``.jsonld`` file (the extension is matched
            case-insensitively), or an already-parsed JSON/JSON-LD object.

    Returns:
        pd.DataFrame | None: The resulting DataFrame, or ``None`` when the
        input could not be processed. Failures are reported on stdout
        rather than raised, so callers must check for ``None``.
    """
    # Local import keeps this function self-contained; it is only needed to
    # recognise path-like objects such as pathlib.Path.
    import os

    df = None
    try:
        if isinstance(data, (str, os.PathLike)):
            path = os.fspath(data)
            # Compare on a lower-cased copy so "DATA.CSV" is accepted, while
            # the original path is still the one handed to the readers.
            suffix_probe = path.lower()
            if suffix_probe.endswith((".xls", ".xlsx")):
                df = pd.read_excel(path)
            elif suffix_probe.endswith(".csv"):
                df = pd.read_csv(path)
            elif suffix_probe.endswith((".json", ".jsonld")):
                with open(path, "r", encoding="utf-8") as file:
                    json_data = json.load(file)
                df = _entities_to_dataframe(json_data)
            else:
                raise ValueError("Unsupported file format. Must be .xls, .xlsx, .csv, .json, or .jsonld")
        elif isinstance(data, (dict, list)):
            # Raw JSON or JSON-LD object passed directly.
            df = _entities_to_dataframe(data)
        else:
            raise ValueError("Unsupported input type. Must be file path or JSON object.")
    except Exception as e:
        # Deliberate best-effort: report the problem and fall through to
        # return None instead of propagating the exception to the caller.
        print(f"Error processing data: {e}")
    return df


def _entities_to_dataframe(json_data):
    """Flatten a parsed JSON/JSON-LD payload into a DataFrame, one row per entity."""
    # A top-level list is already a list of entities. A dict is either a
    # JSON-LD document that wraps its entities in "@graph" or a single entity.
    if isinstance(json_data, list):
        entities = json_data
    else:
        entities = json_data.get("@graph", [json_data])
    # A DataFrame built from a list of dicts already has a fresh 0..n-1
    # index, so no reset_index() call is needed afterwards.
    return pd.DataFrame([flatten_dict(entity) for entity in entities])
5551

5652

@@ -60,63 +56,37 @@ def flatten_dict(d, parent_key="", sep=".", preserve_keys=None):
6056
6157
Args:
6258
d (dict): The dictionary to flatten.
63-
parent_key (str): The base key for recursion, used to create hierarchical keys.
64-
sep (str): The separator for nested keys (default is '.').
65-
preserve_keys (list): Keys whose values should not be flattened (default is None).
59+
parent_key (str): Prefix for keys during recursion.
60+
sep (str): Separator used for key hierarchy.
61+
preserve_keys (list): Keys whose values should not be flattened.
6662
67-
Return:
68-
A flattened dictionary with keys representing the hierarchy.
63+
Returns:
64+
dict: A flattened dictionary.
6965
"""
7066
if preserve_keys is None:
71-
preserve_keys = ["coordinates", "@context"] # Keys to preserve as lists
67+
preserve_keys = ["coordinates", "@context"]
68+
7269
items = []
7370
for k, v in d.items():
74-
# Create the new key by appending current key to parent_key
7571
new_key = f"{parent_key}{sep}{k}" if parent_key else k
76-
# Recursively flatten if value is a dictionary
72+
7773
if isinstance(v, dict):
7874
if k in preserve_keys:
79-
# Preserve the dictionary as-is if key is in preserve_keys
8075
items.append((new_key, v))
8176
else:
82-
items.extend(
83-
flatten_dict(
84-
v, new_key, sep=sep, preserve_keys=preserve_keys
85-
).items()
86-
)
87-
elif isinstance(v, list) and k in preserve_keys:
88-
# Preserve the list as-is
89-
items.append((new_key, v))
77+
items.extend(flatten_dict(v, new_key, sep=sep, preserve_keys=preserve_keys).items())
78+
9079
elif isinstance(v, list):
91-
# Flatten lists unless the key is in preserve_keys
92-
for i, item in enumerate(v):
93-
if isinstance(item, dict):
94-
# Flatten each dictionary inside the list
95-
items.extend(
96-
flatten_dict(
97-
item,
98-
f"{new_key}[{i}]",
99-
sep=sep,
100-
preserve_keys=preserve_keys,
101-
).items()
102-
)
103-
else:
104-
# Handle primitive values in the list
105-
items.append((f"{new_key}[{i}]", item))
106-
# Handle all other key-value pairs
80+
if k in preserve_keys:
81+
items.append((new_key, v))
82+
else:
83+
for i, item in enumerate(v):
84+
if isinstance(item, dict):
85+
items.extend(flatten_dict(item, f"{new_key}[{i}]", sep=sep, preserve_keys=preserve_keys).items())
86+
else:
87+
items.append((f"{new_key}[{i}]", item))
88+
10789
else:
10890
items.append((new_key, v))
109-
return dict(items)
110-
111-
112-
def flatten_entity(entity):
113-
"""
114-
Flattens a single NGSI-LD entity by applying flatten_dict.
11591

116-
Args:
117-
entity (dict): The NGSI-LD entity to flatten.
118-
119-
Returns:
120-
dict: A flattened version of the entity.
121-
"""
122-
return flatten_dict(entity)
92+
return dict(items)

0 commit comments

Comments
 (0)