Skip to content
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -266,13 +266,16 @@ Running the command again after adding more archives or providers will automatic

For each [source pair](#build-source-pairs), we now fetch captures from the archive service that corresponds to the provider's domain and URL prefix given in the source pair. Again, rerunning the command after adding more source pairs fetches just the missing captures.

```shell
aql captures fetch
```

#### Parse SERP URLs

Not every capture necessarily points to a search engine result page (SERP). But usually, SERPs contain the user query in the URL, so we can filter out non-SERP captures by parsing the URLs.

```shell
aql serps parse url-query
```

Parsing the query from the capture URL will add SERPs to a new, more focused index that only contains SERPs. From the SERPs, we can also parse the page number and offset of the SERP, if available.
Expand Down Expand Up @@ -309,13 +312,13 @@ A pointer to the WARC block in S3 is stored in the SERP index so that we can eff
From the WARC contents, we can now parse the query as it appears on the SERP (which can sometimes differ from the query encoded in the URL).

```shell
aql serps parse serp-query
aql serps parse warc-query
```

More importantly, we can parse the snippets of the SERP.

```shell
aql serps parse serp-snippets
aql serps parse warc-snippets
```

Parsing the snippets from the SERP's WARC contents will also add the SERP's results to a new index.
Expand Down
50 changes: 43 additions & 7 deletions archive_query_log/parsers/xml.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from shutil import copyfileobj
from tempfile import TemporaryFile
from typing import Literal, Type, TypeVar, Iterable
from warnings import warn

from cssselect import GenericTranslator
from cssselect.parser import parse as cssselect_parse
from lxml.etree import parse as etree_parse, XMLParser, HTMLParser # nosec: B410
# noinspection PyProtectedMember
from lxml.etree import _ElementTree, _Element # nosec: B410
from resiliparse.parse import detect_encoding
from warcio.recordloader import ArcWarcRecord

XmlParserType = Literal[
Expand All @@ -17,21 +19,55 @@
def parse_xml_tree(record: ArcWarcRecord) -> _ElementTree | None:
    """Parse the HTTP payload of a WARC record into an lxml element tree.

    Chooses an XML or HTML parser based on the record's ``Content-Type``
    header, buffers the (non-seekable) content stream in a temporary
    file, sanity-checks that the payload looks like markup, and parses it.

    :param record: WARC record whose HTTP response body should be parsed.
    :return: The parsed element tree, or ``None`` (after emitting a
        ``UserWarning``) if no MIME type is given, the MIME type is
        unsupported, the payload cannot be decompressed or decoded,
        or the content does not look like XML/HTML.
    """
    mime_type: str | None = record.http_headers.get_header("Content-Type")
    if mime_type is None:
        warn("No MIME type given.", UserWarning)
        return None
    # Strip MIME parameters such as "; charset=utf-8".
    mime_type = mime_type.split(";", maxsplit=1)[0]

    parser: XMLParser | HTMLParser
    if mime_type == "text/xml":
        parser = XMLParser()
    elif mime_type == "text/html":
        parser = HTMLParser()
    else:
        warn(f"Cannot find XML parser for MIME type: {mime_type}", UserWarning)
        return None

    # Original (pre-redirect) URL of the capture, used in warnings only.
    wayback_url = record.rec_headers.get_header("WARC-Target-URI")

    with TemporaryFile() as tmp_file:
        # Copy the content stream to a temporary file.
        # This is necessary because the content stream is not seekable.
        try:
            copyfileobj(record.content_stream(), tmp_file)
        except AttributeError as e:
            # Truncated Brotli payloads surface as an AttributeError on
            # the decompressor's "unused_data" attribute — TODO confirm
            # against warcio's decompression internals.
            if e.name == "unused_data":
                warn(f"Brotli decompression error: {wayback_url}", UserWarning)
                return None
            # Anything else is unexpected: do not swallow it silently,
            # as the temp file may be incomplete.
            raise
        tmp_file.seek(0)

        # Detect encoding using Resiliparse, based on the first 2048 bytes.
        encoding: str = detect_encoding(tmp_file.peek(2048), from_html_meta=True)

        # Decode the first 100 characters to check for XML/HTML content.
        # Note: 2048 bytes should be enough to decode the first 100 characters.
        try:
            head = tmp_file.peek(2048).decode(encoding)[:100]
        except UnicodeDecodeError:
            warn(f"Decoding error: {wayback_url}", UserWarning)
            return None

        # Markup must contain at least one tag opener near the start.
        if "<" not in head:
            warn(f"Skipping non-XML document: {wayback_url}", UserWarning)
            return None

        # Payloads starting like JSON are mislabeled; skip them.
        if head[0] in ("{", "[", '"'):
            warn(f"Skipping JSON-like document: {wayback_url}", UserWarning)
            return None

        return etree_parse(  # nosec: B320
            source=tmp_file,
            parser=parser,
        )


_T = TypeVar("_T")
Expand Down