Skip to content

Commit d176883

Browse files
feat: Add data commons toolkit (#979)
Co-authored-by: Wendong <[email protected]> Co-authored-by: Wendong-Fan <[email protected]>
1 parent fb47b76 commit d176883

File tree

5 files changed

+784
-7
lines changed

5 files changed

+784
-7
lines changed
Lines changed: 360 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,360 @@
1+
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
2+
# Licensed under the Apache License, Version 2.0 (the “License”);
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an “AS IS” BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
14+
import logging
15+
from typing import Any, Dict, List, Optional, Union
16+
17+
from camel.toolkits.base import BaseToolkit
18+
19+
# Module-level logger; the toolkit methods below report failed Data Commons
# calls through it before returning ``None``.
logger = logging.getLogger(__name__)
20+
21+
22+
class DataCommonsToolkit(BaseToolkit):
    r"""A class representing a toolkit for Data Commons.

    This class provides methods for querying and retrieving data from the
    Data Commons knowledge graph. It includes functionality for:
    - Executing SPARQL queries
    - Retrieving triples associated with nodes
    - Fetching statistical time series data
    - Analyzing property labels and values
    - Retrieving places within a given place type
    - Obtaining statistical values for specific variables and locations

    All the data are grabbed from the knowledge graph of Data Commons.
    Refer to https://datacommons.org/browser/ for more details.

    Every public method wraps a single Data Commons client call. Any
    exception raised by that call is logged and :obj:`None` is returned
    instead of propagating the error. The client libraries are imported at
    call time, inside each method.
    """

    @staticmethod
    def _as_dcid_list(dcids: Union[str, List[str]]) -> List[str]:
        r"""Wrap a bare string into a single-element list.

        The Data Commons client functions are documented to take an
        iterable of DCID strings. A bare string is itself iterable (character
        by character), so it is wrapped here to keep it as one identifier.

        Args:
            dcids (Union[str, List[str]]): A single DCID or a collection of
                DCIDs.

        Returns:
            List[str]: :obj:`[dcids]` if a string was given, otherwise the
                input object unchanged.
        """
        if isinstance(dcids, str):
            return [dcids]
        return dcids

    @staticmethod
    def query_data_commons(
        query_string: str,
    ) -> Optional[List[Dict[str, Any]]]:
        r"""Query the Data Commons knowledge graph using SPARQL.

        Args:
            query_string (str): A SPARQL query string.

        Returns:
            Optional[List[Dict[str, Any]]]: A list of dictionaries, each
                representing a node matching the query conditions if success,
                (default: :obj:`None`) otherwise.

        Note:
            - Only supports a limited subset of SPARQL functionality (ORDER BY,
              DISTINCT, LIMIT).
            - Each variable in the query should have a 'typeOf' condition.
            - The Python SPARQL library currently only supports the V1 version
              of the API.

        Reference:
            https://docs.datacommons.org/api/python/query.html
        """
        import datacommons

        try:
            results = datacommons.query(query_string)

            # Hand back shallow copies so callers receive plain dicts.
            processed_results = [dict(row) for row in results]

            return processed_results

        except Exception as e:
            logger.error(
                f"An error occurred while querying Data Commons: {e!s}"
            )
            return None

    @staticmethod
    def get_triples(
        dcids: Union[str, List[str]], limit: int = 500
    ) -> Optional[Dict[str, List[tuple]]]:
        r"""Retrieve triples associated with nodes.

        Args:
            dcids (Union[str, List[str]]): A single DCID or a list of DCIDs
                to query.
            limit (int): The maximum number of triples per
                combination of property and type. (default: :obj:`500`)

        Returns:
            Optional[Dict[str, List[tuple]]]: A dictionary where keys are
                DCIDs and values are lists of associated triples if success,
                (default: :obj:`None`) otherwise.

        Note:
            - This function does not raise on failure. Any exception coming
              from the underlying Data Commons call (for example because of
              invalid arguments, an out-of-range limit or unknown DCIDs) is
              logged and :obj:`None` is returned.
            - A single DCID string is wrapped in a list before the call.

        Reference:
            https://docs.datacommons.org/api/python/triple.html
        """
        import datacommons

        dcids = DataCommonsToolkit._as_dcid_list(dcids)

        try:
            result = datacommons.get_triples(dcids, limit)
            return result

        except Exception as e:
            logger.error(f"An error occurred: {e!s}")
            return None

    @staticmethod
    def get_stat_time_series(
        place: str,
        stat_var: str,
        measurement_method: Optional[str] = None,
        observation_period: Optional[str] = None,
        unit: Optional[str] = None,
        scaling_factor: Optional[str] = None,
    ) -> Optional[Dict[str, Any]]:
        r"""Retrieve statistical time series for a place.

        Args:
            place (str): The dcid of the Place to query for.
            stat_var (str): The dcid of the StatisticalVariable.
            measurement_method (str, optional): The technique used for
                measuring a statistical variable. (default: :obj:`None`)
            observation_period (str, optional): The time period over which an
                observation is made. (default: :obj:`None`)
            unit (str, optional): The unit of measurement. (default:
                :obj:`None`)
            scaling_factor (str, optional): Property of statistical variables
                indicating factor by which a measurement is multiplied to fit
                a certain format. (default: :obj:`None`)

        Returns:
            Optional[Dict[str, Any]]: A dictionary containing the statistical
                time series data if success, (default: :obj:`None`) otherwise.

        Reference:
            https://docs.datacommons.org/api/python/stat_series.html
        """
        import datacommons_pandas

        try:
            # Arguments are forwarded positionally, in signature order.
            result = datacommons_pandas.get_stat_series(
                place,
                stat_var,
                measurement_method,
                observation_period,
                unit,
                scaling_factor,
            )
            return result
        except Exception as e:
            logger.error(
                f"An error occurred while querying Data Commons: {e!s}"
            )
            return None

    @staticmethod
    def get_property_labels(
        dcids: Union[str, List[str]], out: bool = True
    ) -> Optional[Dict[str, List[str]]]:
        r"""Retrieves and analyzes property labels for given DCIDs.

        Args:
            dcids (Union[str, List[str]]): A single Data Commons ID (DCID) or
                a list of DCIDs to analyze. A single string is wrapped in a
                list before the call.
            out (bool): Direction of properties to retrieve. (default:
                :obj:`True`)

        Returns:
            Optional[Dict[str, List[str]]]: Analysis results for each DCID if
                success, (default: :obj:`None`) otherwise.

        Reference:
            https://docs.datacommons.org/api/python/property_label.html
        """
        import datacommons

        dcids = DataCommonsToolkit._as_dcid_list(dcids)

        try:
            result = datacommons.get_property_labels(dcids, out=out)
            return result
        except Exception as e:
            logger.error(
                f"An error occurred while analyzing property labels: {e!s}"
            )
            return None

    @staticmethod
    def get_property_values(
        dcids: Union[str, List[str]],
        prop: str,
        out: Optional[bool] = True,
        value_type: Optional[str] = None,
        limit: Optional[int] = None,
    ) -> Optional[Dict[str, Any]]:
        r"""Retrieves and analyzes property values for given DCIDs.

        Args:
            dcids (Union[str, List[str]]): A single Data Commons ID (DCID) or
                a list of DCIDs to analyze. A single string is wrapped in a
                list before the call.
            prop (str): The property to analyze.
            out (bool, optional): The label's direction. If :obj:`True`, only
                response nodes directed towards the requested node are
                returned. If set to False, will only return response nodes
                directed away from the request node. (default: :obj:`True`)
            value_type (str, optional): The type of the property value to
                filter by. Only applicable if the value refers to a node.
                (default: :obj:`None`)
            limit (int, optional): (≤ 500) Maximum number of values returned
                per node. When left as :obj:`None` the argument is not
                forwarded, so the Data Commons client applies its own
                default. (default: :obj:`None`)

        Returns:
            Optional[Dict[str, Any]]: Analysis results for each DCID if
                success, (default: :obj:`None`) otherwise.

        Reference:
            https://docs.datacommons.org/api/python/property_value.html
        """
        import datacommons

        dcids = DataCommonsToolkit._as_dcid_list(dcids)

        # Forward `limit` only when the caller chose one; passing an explicit
        # None would replace the client's default limit.
        optional_kwargs: Dict[str, Any] = {}
        if limit is not None:
            optional_kwargs["limit"] = limit

        try:
            result = datacommons.get_property_values(
                dcids, prop, out, value_type, **optional_kwargs
            )
            return result

        except Exception as e:
            logger.error(
                f"An error occurred while analyzing property values: {e!s}"
            )
            return None

    @staticmethod
    def get_places_in(
        dcids: list, place_type: str
    ) -> Optional[Dict[str, Any]]:
        r"""Retrieves places within a given place type.

        Args:
            dcids (list): A list of Data Commons IDs (DCIDs) to analyze. A
                single string is also accepted and is wrapped in a list.
            place_type (str): The type of the place to filter by.

        Returns:
            Optional[Dict[str, Any]]: Analysis results for each DCID if
                success, (default: :obj:`None`) otherwise.

        Reference:
            https://docs.datacommons.org/api/python/place_in.html
        """
        import datacommons

        dcids = DataCommonsToolkit._as_dcid_list(dcids)

        try:
            result = datacommons.get_places_in(dcids, place_type)
            return result

        except Exception as e:
            logger.error(
                "An error occurred while retrieving places in a given place "
                f"type: {e!s}"
            )
            return None

    @staticmethod
    def get_stat_value(
        place: str,
        stat_var: str,
        date: Optional[str] = None,
        measurement_method: Optional[str] = None,
        observation_period: Optional[str] = None,
        unit: Optional[str] = None,
        scaling_factor: Optional[str] = None,
    ) -> Optional[float]:
        r"""Retrieves the value of a statistical variable for a given place
        and date.

        Args:
            place (str): The DCID of the Place to query for.
            stat_var (str): The DCID of the StatisticalVariable.
            date (str, optional): The preferred date of observation in ISO
                8601 format. If not specified, returns the latest observation.
                (default: :obj:`None`)
            measurement_method (str, optional): The DCID of the preferred
                measurementMethod value. (default: :obj:`None`)
            observation_period (str, optional): The preferred observationPeriod
                value. (default: :obj:`None`)
            unit (str, optional): The DCID of the preferred unit value.
                (default: :obj:`None`)
            scaling_factor (str, optional): The preferred scalingFactor value.
                (default: :obj:`None`)

        Returns:
            Optional[float]: The value of the statistical variable for the
                given place and date if success, (default: :obj:`None`)
                otherwise.

        Reference:
            https://docs.datacommons.org/api/python/stat_value.html
        """
        import datacommons

        try:
            # Arguments are forwarded positionally, in signature order.
            result = datacommons.get_stat_value(
                place,
                stat_var,
                date,
                measurement_method,
                observation_period,
                unit,
                scaling_factor,
            )
            return result

        except Exception as e:
            logger.error(
                "An error occurred while retrieving the value of a "
                f"statistical variable: {e!s}"
            )
            return None

    @staticmethod
    def get_stat_all(
        places: Union[str, List[str]], stat_vars: Union[str, List[str]]
    ) -> Optional[dict]:
        r"""Retrieves the statistical data available for the given places
        and statistical variables.

        Args:
            places (Union[str, List[str]]): The DCID IDs of the Place objects
                to query for. (Here DCID stands for Data Commons ID, the
                unique identifier assigned to all entities in Data Commons.)
                A single string is wrapped in a list before the call.
            stat_vars (Union[str, List[str]]): The dcids of the
                StatisticalVariables at
                https://datacommons.org/browser/StatisticalVariable
                A single string is wrapped in a list before the call.

        Returns:
            Optional[dict]: The dictionary returned by Data Commons, keyed by
                the DCID of the place, if success, (default: :obj:`None`)
                otherwise. See the reference below for the exact layout.

        Reference:
            https://docs.datacommons.org/api/python/stat_all.html
        """
        import datacommons

        places = DataCommonsToolkit._as_dcid_list(places)
        stat_vars = DataCommonsToolkit._as_dcid_list(stat_vars)

        try:
            result = datacommons.get_stat_all(places, stat_vars)
            return result

        except Exception as e:
            logger.error(
                "An error occurred while retrieving the value of a "
                f"statistical variable: {e!s}"
            )
            return None

0 commit comments

Comments
 (0)