Skip to content

Commit 75ef9bb

Browse files
committed
implemented endpoints insights_tarb and insights_wiki
1 parent 120f17f commit 75ef9bb

File tree

12 files changed

+437
-9
lines changed

12 files changed

+437
-9
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
### IARI version 4.4.6
2+
3+
- Added exturls to insights endpoint, which is insights_webrx internally

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "Internet Archive Reference Inventory (IARI)"
3-
version = "4.4.10"
3+
version = "4.4.12"
44
description = "API capable of fetching, extracting, transforming and storing reference information from Wikipedia articles, websites and PDFs as structured data."
55
authors = [
66
"Chris Lombardi <mojomonger@archive.org>",

src/__init__.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@
2525
# # new stuff aug 2025
2626
# from src.views.v2.refs_lookup_v2 import GetArchiveV2
2727

28-
# # new stuff oct 2025
29-
# from src.views.v2.insights_tarb_v2 import InsightsTarbV2
28+
# new stuff oct 2025
29+
from src.views.v2.insights_wiki_v2 import InsightsWikiV2
30+
from src.views.v2.insights_tarb_v2 import InsightsTarbV2
3031
# new stuff jun 2025
3132
from src.views.v2.refs_lookup_v2 import RefsLookupV2
3233
# new stuff mar 2025
@@ -145,7 +146,8 @@ def favicon():
145146
# bigger picture
146147
api.add_resource(RefsLookupV2, "/refs_lookup") # James' Wiki Citations Database
147148
api.add_resource(InsightsWebRxV2, "/insights") # Stephen's numbers
148-
# api.add_resource(InsightsTarbV2, "/tarb_insights") # Sawood's numbers
149+
api.add_resource(InsightsTarbV2, "/tarb_insights") # Sawood's numbers
150+
api.add_resource(InsightsWikiV2, "/wiki_insights") # Wiki numbers (from stephen's wiki endpoint)
149151

150152
# Other
151153
api.add_resource(ArticleCacheV2, "/article_cache") # for offline article fetching

src/helpers/wiki_utils.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# wiki_utils.py
2+
import requests
3+
from typing import Optional, Union
4+
5+
from src.helpers.cache_utils import get_cache, set_cache, is_cached, CacheType
6+
7+
from src.constants.constants import ProbeMethod
8+
from src.models.v2.probes.probe_test import ProbeTest
9+
from src.models.v2.probes.probe_trust_project import ProbeTrustProject
10+
from src.models.v2.probes.probe_verifyi import ProbeVerifyi
11+
12+
13+
class WikiUtils:
    """Helpers that fetch Wikipedia statistics published on Wikimedia Commons."""

    # Upper bound (seconds) on the Commons request, so a stalled upstream
    # cannot hang the calling request indefinitely.
    REQUEST_TIMEOUT_SECONDS = 30

    @staticmethod
    def run_wiki_logic(data):
        """Placeholder hook: ignores ``data`` and returns a fixed payload."""
        # complex logic here
        return {"result": "ok"}

    @staticmethod
    def get_exturls():
        """
        returns wiki stats for external urls

        Returns:
            dict with a single key "external_urls" holding whatever
            get_exturls_data() returned (the decoded payload on success,
            or {"errors": [...]} on failure).
        """
        # imported inside the function, presumably to avoid a circular
        # import with the src package
        from src import app
        app.logger.debug("==> get_exturls_data")

        # caching is not wired up yet: every call fetches fresh data
        return {
            "external_urls": WikiUtils.get_exturls_data(),
        }

    @staticmethod
    def get_exturls_data():
        """
        Fetch the raw "exturls" statistics table from Wikimedia Commons.

        Returns:
            The decoded JSON payload when the request succeeds with
            status 200; otherwise a dict of the form {"errors": [message]}.
            Connection failures, timeouts and undecodable bodies are
            reported the same way instead of raising.
        """
        headers = {
            "User-Agent": "IARI, see https://github.com/internetarchive/iari",
        }

        exturls_api_url = 'https://commons.wikimedia.org/w/index.php?title=Data%3AWikipedia%5Fstatistics%2Fexturls%2Etab&action=raw'

        try:
            response = requests.get(
                exturls_api_url,
                headers=headers,
                timeout=WikiUtils.REQUEST_TIMEOUT_SECONDS,
            )
        except requests.RequestException as e:
            # connection error, timeout, etc.: report it like an HTTP failure
            return WikiUtils._error_result(
                f"Error fetching Wiki external urls data"
                f" Request to {exturls_api_url} failed: {e}"
            )

        if response.status_code != 200:
            return WikiUtils._error_result(
                f"Error fetching Wiki external urls data"
                f" Got {response.status_code} from {exturls_api_url}"
                f" Text: {response.text}"
            )

        try:
            return response.json()
        except ValueError as e:
            # a 200 response whose body is not valid JSON
            return WikiUtils._error_result(
                f"Error fetching Wiki external urls data"
                f" Invalid JSON from {exturls_api_url}: {e}"
            )

    @staticmethod
    def _error_result(msg):
        """Log ``msg`` and wrap it in the {"errors": [...]} failure shape."""
        from src import app
        app.logger.debug(msg)

        return {"errors": [msg]}
86+
87+
88+
if __name__ == "__main__":
    # Example command line test.
    # get_exturls() takes no arguments, so it is called without a URL.
    result = WikiUtils.get_exturls()
    print("Test result for get_exturls():")
    print(result)
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from src import MissingInformationError
2+
from src.models.v2.job import JobV2
3+
4+
5+
class InsightsTarbJobV2(JobV2):
    """job that supports InsightsTarbV2 endpoint"""

    # Request parameters. Values arrive from the marshmallow schema
    # (InsightsTarbSchemaV2), whose post_load hook builds this job.
    date_start: str = ""
    date_end: str = ""

    def validate_fields(self):
        """
        parameter checking here, if any

        Currently a no-op: no parameter is validated yet.
        """
20+
21+
22+
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from src import MissingInformationError
2+
from src.models.v2.job import JobV2
3+
4+
5+
class InsightsWikiJobV2(JobV2):
    """job that supports the wiki insights endpoint (built by InsightsWikiSchemaV2)"""

    # Request parameters. Values arrive from the marshmallow schema
    # (InsightsWikiSchemaV2), whose post_load hook builds this job.
    date_start: str = ""
    date_end: str = ""

    def validate_fields(self):
        """
        parameter checking here, if any

        Currently a no-op: no parameter is validated yet.
        """
20+
21+
22+
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
from marshmallow import fields, pre_load, post_load
2+
3+
from src.models.v2.job.insights_tarb_job_v2 import InsightsTarbJobV2
4+
from src.models.v2.schema import BaseSchemaV2
5+
6+
7+
class InsightsTarbSchemaV2(BaseSchemaV2):
    """
    Defines expected parameters for the TARB insights endpoint.

    Default parameters are defined in BaseSchemaV2.
    """

    # NOTE(review): marshmallow's ``default`` is the dump-side default and
    # is deprecated in newer 3.x releases - confirm whether a load-side
    # default was intended here.
    date_start = fields.Str(default=None, required=False)
    date_end = fields.Str(default=None, required=False)

    # NB: post_load is a marshmallow directive; the decorated function runs
    # after the request args are loaded and pulls the loaded values into a
    # Job object.
    #
    # **kwargs is needed here despite what the validator claims
    # noinspection PyUnusedLocal
    @post_load
    def return_job_object(self, data, **kwargs) -> InsightsTarbJobV2:  # type: ignore # dead: disable
        """Return Job object"""
        tarb_job = InsightsTarbJobV2(**data)
        tarb_job.validate_fields()

        # NB job field values can be adjusted here before returning, if needed
        return tarb_job
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from marshmallow import fields, pre_load, post_load
2+
3+
from src.models.v2.job.insights_wiki_job_v2 import InsightsWikiJobV2
4+
from src.models.v2.schema import BaseSchemaV2
5+
6+
7+
class InsightsWikiSchemaV2(BaseSchemaV2):
    """
    Defines expected parameters for the wiki insights endpoint.

    Default parameters are defined in BaseSchemaV2.
    """

    # NOTE(review): marshmallow's ``default`` is the dump-side default and
    # is deprecated in newer 3.x releases - confirm whether a load-side
    # default was intended here.
    date_start = fields.Str(default=None, required=False)
    date_end = fields.Str(default=None, required=False)

    # NB: post_load is a marshmallow directive; the decorated function runs
    # after the request args are loaded and pulls the loaded values into a
    # Job object.
    #
    # **kwargs is needed here despite what the validator claims
    # noinspection PyUnusedLocal
    @post_load
    def return_job_object(self, data, **kwargs) -> InsightsWikiJobV2:  # type: ignore # dead: disable
        """Return Job object"""
        from src import app

        logger = app.logger
        logger.debug("==> InsightsWikiJobV2::@post_load:return_job_object")
        logger.debug(f"return_job_object data: {data}")

        wiki_job = InsightsWikiJobV2(**data)
        wiki_job.validate_fields()

        # NB job field values can be adjusted here before returning, if needed
        return wiki_job

src/views/v2/insights_tarb_v2.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
from typing import Any, Optional, Tuple, List, Dict
2+
import traceback
3+
import time
4+
5+
import requests
6+
from bs4 import BeautifulSoup, NavigableString
7+
from flask import request
8+
9+
from src.helpers.get_version import get_poetry_version
10+
11+
from src.models.exceptions import MissingInformationError, WikipediaApiFetchError
12+
from src.models.wikimedia.enums import RequestMethods
13+
from src.views.v2.statistics import StatisticsViewV2
14+
15+
from src.models.v2.job.insights_tarb_job_v2 import InsightsTarbJobV2
16+
from src.models.v2.schema.insights_tarb_schema_v2 import InsightsTarbSchemaV2
17+
18+
19+
class InsightsTarbV2(StatisticsViewV2):

    """
    returns IABot statistical data

    Reports itself as the "tarb_insights" command. The upstream statistics
    payload is fetched but not parsed yet; placeholder values are returned.
    """

    schema = InsightsTarbSchemaV2()  # Defines expected parameters; Overrides StatisticsViewV2's "schema" property
    job: InsightsTarbJobV2  # Holds usable variables, seeded from schema. Overrides StatisticsViewV2's "job"

    # Class-level default only; each request rebinds return_data on the
    # instance, so the shared dict is never mutated in place.
    return_data: Dict[str, Any] = {}  # holds parsed data from data processing
    execution_errors: Optional[List[Dict[str, Any]]] = None

    # Upper bound (seconds) on the upstream IABot request, so a stalled
    # upstream cannot hang this endpoint indefinitely.
    REQUEST_TIMEOUT_SECONDS = 30

    def get(self):
        """
        flask entrypoint for GET
        must return a tuple: (Any, response_code)
        """
        from src import app
        app.logger.debug("==> InsightsTarbV2::get")

        return self.__process_request__(method=RequestMethods.get)

    def __process_request__(self, method=RequestMethods.post):  # default to POST method
        """
        Validate the request, fetch the insight data and build the response.

        Args:
            method: request method forwarded to __validate_and_get_job__.

        Returns:
            (payload, 200) on success; ({"error": message}, 500) when
            validation or the data fetch raises.
        """
        from src import app
        app.logger.debug(f"==> InsightsTarbV2::__process_request__, method = {method}")

        # Start the timer
        start_time = time.time()

        try:
            # validate and setup params
            self.__validate_and_get_job__(method)  # inherited/subclassed from StatisticsViewV2

            # fetch the data, parse and return summary
            insight_data = self.__get_insight_data__()

            # Stop the timer and calculate execution time
            execution_time = time.time() - start_time

            self.return_data = {
                "iari_version": get_poetry_version("pyproject.toml"),
                "iari_command": "tarb_insights",
                "endpoint": request.endpoint,
                "execution_time": f"{execution_time:.4f} seconds",
                "execution_errors": self.execution_errors,
            }
            self.return_data.update(insight_data)

            return self.return_data, 200

        except MissingInformationError as e:
            traceback.print_exc()
            return {"error": f"Missing Information Error: {str(e)}"}, 500

        except Exception as e:
            # top-level boundary: includes network failures raised by the fetch
            traceback.print_exc()
            return {"error": f"General Error: {str(e)}"}, 500

    def __get_insight_data__(self):
        """
        grabs appropriate data regarding media updates

        Fetches the IABot statistics endpoint. The response body is not
        parsed yet (TODO); fixed placeholder values are returned.

        format of each returned record (per the original author's note):

            {
                "Wiki": "afwiki",
                "Timestamp": "2021-08-14 00:00:00",
                "TotalEdits": 2709,
                "TotalLinks": 3543,
                "ReactiveEdits": 1620,
                "ProactiveEdits": 90,
                "DeadEdits": 278,
                "UnknownEdits": 721,
                "LiveLinks": 163,
                "DeadLinks": 2640,
                "TagLinks": 740,
                "UnknownLinks": 0
            },

        Returns:
            dict of placeholder values merged into the endpoint response.

        Raises:
            requests.RequestException: the request failed or timed out.
            RuntimeError: the upstream answered with a non-200 status.
        """
        tarb_api_url = "https://iabot.wmcloud.org/api.php?action=statistics&format=flat"

        # Network errors propagate to __process_request__, which turns them
        # into a 500 response.
        r = requests.get(tarb_api_url, timeout=self.REQUEST_TIMEOUT_SECONDS)

        if r.status_code != 200:
            # Raise a descriptive error rather than returning None, which the
            # caller would hit as an opaque TypeError in dict.update().
            raise RuntimeError(
                f"Error fetching TARB statistics:"
                f" got {r.status_code} from {tarb_api_url}"
            )

        # TODO parse r into per-wiki records; placeholders for now
        return {
            "test_value_1": 1,
            "test_value_2": 2,
            "test_value_3": 3,
        }
135+
136+

0 commit comments

Comments
 (0)