Skip to content

Commit 38a79e4

Browse files
committed
Create robots.txt file for the website
Signed-off-by: Jiri Podivin <jpodivin@redhat.com>
1 parent 5933349 commit 38a79e4

6 files changed

Lines changed: 126 additions & 1 deletion

File tree

backend/src/api.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
JSONResponse,
1919
FileResponse,
2020
RedirectResponse,
21+
PlainTextResponse,
2122
)
2223
from fastapi.staticfiles import StaticFiles
2324
from fastapi.templating import Jinja2Templates
@@ -62,6 +63,7 @@
6263
write_json_file,
6364
fetch_text,
6465
sanitize_uploaded_schema,
66+
get_robots,
6567
)
6668
from src.store import Storator3000
6769
from src.exceptions import NoDataFound
@@ -634,3 +636,9 @@ def get_report_stats() -> dict:
634636
LOGGER.info("Retrieving annotation statistics")
635637

636638
return Storator3000.get_stats()
639+
640+
641+
@app.get(
    "/robots.txt",
    response_class=PlainTextResponse,
    include_in_schema=False,
)
def robots() -> str:
    """Serve the robots.txt text produced by ``get_robots`` as a plain-text response.

    The route is registered with ``include_in_schema=False``.
    """
    robots_txt = get_robots()
    return robots_txt

backend/src/constants.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,10 @@ class BuildIdTitleEnum(StrEnum):
4747
LOGGER_NAME = "logdetective_website"
4848

4949
STATIC_SOURCE_DIR = Path(__file__).parent.parent.parent / "frontend" / "public"
50+
51+
# Built-in robots.txt policy for every user agent ("User-Agent: *"):
# all paths stay accessible ("Allow: /") while AI training use is refused
# through the DisallowAITraining and Content-Usage directives.
# NOTE: the literal opens with a newline, so the text starts with an empty line.
DEFAULT_ROBOTS = """
52+
User-Agent: *
53+
DisallowAITraining: /
54+
Content-Usage: ai=n
55+
Allow: /
56+
"""

backend/src/spells.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import tarfile
1010
import tempfile
1111
from contextlib import contextmanager
12+
from functools import lru_cache
1213
from pathlib import Path
1314
from typing import Any, Iterator, Optional
1415

@@ -30,6 +31,7 @@
3031
html_careful_unescape,
3132
log_schema_redaction,
3233
)
34+
from src.constants import DEFAULT_ROBOTS, STATIC_SOURCE_DIR
3335

3436

3537
@contextmanager
@@ -269,3 +271,16 @@ def sanitize_uploaded_schema(input_schema: FeedbackSchema) -> FeedbackSchema:
269271
)
270272

271273
return result
274+
275+
276+
@lru_cache(maxsize=1)
def get_robots() -> str:
    """Return the contents of robots.txt, or the built-in default as a fallback.

    The file is looked up as ``robots.txt`` inside ``STATIC_SOURCE_DIR`` and
    decoded as UTF-8. If it cannot be opened (missing, unreadable, a directory,
    ...), ``DEFAULT_ROBOTS`` is returned instead.

    The result is memoized, so the file is read at most once until
    ``get_robots.cache_clear()`` is called.

    Returns:
        The robots.txt text to serve.
    """
    # Path() accepts both str and Path, so the lookup keeps working when
    # STATIC_SOURCE_DIR is replaced by a plain string.
    robots_path = Path(STATIC_SOURCE_DIR) / "robots.txt"
    try:
        return robots_path.read_text(encoding="utf-8")
    except OSError:
        # FileNotFoundError, PermissionError, IsADirectoryError, ... all mean
        # the file is not usable; fall back instead of propagating the error.
        return DEFAULT_ROBOTS

backend/tests/conftest.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,3 +217,12 @@ def container_feedback_input_output_schema_tuple():
217217
},
218218
),
219219
)
220+
221+
222+
@pytest.fixture
def clear_robots_cache():
    """Give the test a cold ``get_robots`` cache and leave it cold afterwards.

    ``get_robots`` memoizes its result, so a value cached by one test would
    otherwise be returned to the next one.
    """
    from src.spells import get_robots  # pylint: disable=import-outside-toplevel

    get_robots.cache_clear()
    yield
    # Drop whatever the test cached (e.g. a patched fallback value) so it
    # cannot leak into tests that do not request this fixture.
    get_robots.cache_clear()

backend/tests/unit/test_spells.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1+
from unittest.mock import patch
12
import responses
2-
3+
from src.constants import DEFAULT_ROBOTS
34
from src.spells import (
45
ensure_text,
56
fetch_text,
67
read_json_file,
78
read_text_file,
89
write_json_file,
10+
get_robots,
911
)
1012

1113

@@ -88,3 +90,20 @@ def test_reads_utf8_content(self, tmp_path):
8890
result = read_text_file(file_path)
8991

9092
assert result == content
93+
94+
95+
class TestGetRobots:
    """Checks ``get_robots`` for both the on-disk file and the built-in fallback."""

    def test_file_retrieval(self, clear_robots_cache):  # pylint: disable=unused-argument
        """With the real static directory, the result is text other than the default."""
        content = get_robots()
        assert isinstance(content, str)
        assert content != DEFAULT_ROBOTS

    def test_default_fallback(self, clear_robots_cache):  # pylint: disable=unused-argument
        """With a non-existent static directory, the built-in default is returned."""
        with patch("src.spells.STATIC_SOURCE_DIR", "this/path/does/not/exist"):
            content = get_robots()
        assert isinstance(content, str)
        assert content == DEFAULT_ROBOTS

frontend/public/robots.txt

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Well-known AI agents and bots are allowed to access the website,
2+
# with the exception of the /explain, /contribute and /review paths
3+
4+
User-Agent: GPTBot
5+
User-Agent: ClaudeBot
6+
User-Agent: Claude-User
7+
User-Agent: Claude-SearchBot
8+
User-Agent: CCBot
9+
User-Agent: Google-Extended
10+
User-Agent: Applebot-Extended
11+
User-Agent: Facebookbot
12+
User-Agent: Meta-ExternalAgent
13+
User-Agent: Meta-ExternalFetcher
14+
User-Agent: diffbot
15+
User-Agent: PerplexityBot
16+
User-Agent: Perplexity-User
17+
User-Agent: Omgili
18+
User-Agent: Omgilibot
19+
User-Agent: webzio-extended
20+
User-Agent: ImagesiftBot
21+
User-Agent: Bytespider
22+
User-Agent: TikTokSpider
23+
User-Agent: Amazonbot
24+
User-Agent: Youbot
25+
User-Agent: SemrushBot-OCOB
26+
User-Agent: Petalbot
27+
User-Agent: VelenPublicWebCrawler
28+
User-Agent: TurnitinBot
29+
User-Agent: Timpibot
30+
User-Agent: OAI-SearchBot
31+
User-Agent: ICC-Crawler
32+
User-Agent: AI2Bot
33+
User-Agent: AI2Bot-Dolma
34+
User-Agent: DataForSeoBot
35+
User-Agent: AwarioBot
36+
User-Agent: AwarioSmartBot
37+
User-Agent: AwarioRssBot
38+
User-Agent: Google-CloudVertexBot
39+
User-Agent: PanguBot
40+
User-Agent: Kangaroo Bot
41+
User-Agent: Sentibot
42+
User-Agent: img2dataset
43+
User-Agent: Meltwater
44+
User-Agent: Seekr
45+
User-Agent: peer39_crawler
46+
User-Agent: cohere-ai
47+
User-Agent: cohere-training-data-crawler
48+
User-Agent: DuckAssistBot
49+
User-Agent: Scrapy
50+
User-Agent: Cotoyogi
51+
User-Agent: aiHitBot
52+
User-Agent: Factset_spyderbot
53+
User-Agent: FirecrawlAgent
54+
55+
Disallow: /review
56+
Disallow: /contribute
57+
Disallow: /explain
58+
DisallowAITraining: /review
59+
DisallowAITraining: /contribute
60+
DisallowAITraining: /explain
61+
62+
# All other bots are blocked from training on the website
63+
# but allowed to access it for general use
64+
User-Agent: *
65+
DisallowAITraining: /
66+
Content-Usage: ai=n
67+
Allow: /

0 commit comments

Comments
 (0)