knowledge-api-client-examples/examples/file-management/python/resolve_file_links.py at main · Serviceware/knowledge-api-client-examples · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python3
"""
Fetch a knowledge text, extract every data-sabio-file-id reference from its
HTML content, and resolve each one to FM metadata + pre-signed URL.

Requires:
    pip install requests beautifulsoup4

Usage:
    python resolve_file_links.py
"""

import sys
from typing import Any

import requests
from bs4 import BeautifulSoup

# ---------------------------------------------------------------------------
# Configuration -- replace placeholders with values for your environment.
# ---------------------------------------------------------------------------
# Prefix shared by every endpoint this script calls; request paths such as
# "/authentication/credentials" or "/text/<id>" are appended to it verbatim.
BASE_URL = "https://<your-instance>/sabio-web/services"
# Credentials handed to login(), which sends them as the "login" and "key"
# fields of the JSON request body.
USERNAME = "<username>"
PASSWORD = "<password>"

# ID of the text that main() fetches and scans for file references.
TEXT_ID = "<text-id>"

# HTML attribute whose value identifies a referenced file inside the
# fragment content of a text.
FM_FILE_ATTR = "data-sabio-file-id"


def login(base_url: str, username: str, password: str) -> str:
    """Exchange credentials for a session token.

    POSTs the credentials as JSON to ``{base_url}/authentication/credentials``
    and reads the token from the ``data.key`` field of the JSON response.

    Args:
        base_url: Prefix of the service endpoints; the path is appended as-is.
        username: Sent as the ``login`` field of the request body.
        password: Sent as the ``key`` field of the request body.

    Returns:
        The non-empty value found under ``data.key`` in the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If the response does not carry a token.
    """
    response = requests.post(
        f"{base_url}/authentication/credentials",
        json={"login": username, "key": password},
        headers={"Content-Type": "application/json; charset=utf-8"},
        timeout=30,
    )
    response.raise_for_status()
    # "data" may be missing or an explicit JSON null. `or {}` covers both, so
    # a token-less reply ends in the RuntimeError below instead of an
    # AttributeError on None.
    token = (response.json().get("data") or {}).get("key")
    if not token:
        raise RuntimeError(f"Login failed: {response.text}")
    return token


def fetch_text(base_url: str, token: str, text_id: str) -> dict[str, Any]:
    """Fetch a single text in ``view`` mode and return the decoded JSON body.

    Args:
        base_url: Prefix of the service endpoints.
        token: Session token, sent in the ``sabio-auth-token`` header.
        text_id: Identifier appended to the ``/text/`` path.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    endpoint = f"{base_url}/text/{text_id}"
    auth_headers = {"sabio-auth-token": token}
    query = {"mode": "view"}

    reply = requests.get(endpoint, params=query, headers=auth_headers, timeout=30)
    reply.raise_for_status()
    return reply.json()


def extract_file_ids(text_payload: dict[str, Any]) -> list[str]:
    """Return the distinct ``data-sabio-file-id`` values found in a text payload.

    Reads ``data.result.fragments`` from the payload, parses each fragment's
    non-empty ``content`` as HTML, and collects the value of the file-id
    attribute from every element that carries it.

    Args:
        text_payload: Decoded JSON response for a text. Any level of the
            ``data`` / ``result`` / ``fragments`` chain may be missing or null.

    Returns:
        The file IDs in order of first appearance, without duplicates.
        Empty when the payload has no fragments or no references.
    """
    # Each level of the envelope may be absent or an explicit JSON null;
    # `or {}` / `or []` collapses both cases so the chain never calls a
    # method on None.
    result = (text_payload.get("data") or {}).get("result") or {}
    fragments = result.get("fragments") or []

    # A dict preserves insertion order, so it serves as an ordered set.
    unique_ids: dict[str, None] = {}
    for fragment in fragments:
        content = fragment.get("content") or ""
        if not content:
            continue
        soup = BeautifulSoup(content, "html.parser")
        for element in soup.select(f"[{FM_FILE_ATTR}]"):
            file_id = element.get(FM_FILE_ATTR)
            # Skip attributes that are present but empty.
            if file_id:
                unique_ids.setdefault(file_id, None)
    return list(unique_ids)


def fetch_metadata(base_url: str, token: str, file_id: str) -> dict[str, Any]:
    """Fetch the metadata record for one file and return the decoded JSON body.

    Args:
        base_url: Prefix of the service endpoints.
        token: Session token, sent in the ``sabio-auth-token`` header.
        file_id: Identifier appended to the ``/fm/`` path.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    endpoint = f"{base_url}/fm/{file_id}"
    auth_headers = {"sabio-auth-token": token}

    reply = requests.get(endpoint, headers=auth_headers, timeout=30)
    reply.raise_for_status()
    return reply.json()


def fetch_urls(base_url: str, token: str, file_id: str, *, download: bool = False) -> dict[str, Any]:
    """Fetch the URL information for one file and return the decoded JSON body.

    Args:
        base_url: Prefix of the service endpoints.
        token: Session token, sent in the ``sabio-auth-token`` header.
        file_id: Identifier appended to the ``/fm/url/`` path.
        download: Forwarded as the ``download`` query parameter, spelled
            ``"true"`` or ``"false"``.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # The query parameter is sent as a lowercase literal rather than a
    # Python bool.
    if download:
        download_flag = "true"
    else:
        download_flag = "false"

    reply = requests.get(
        f"{base_url}/fm/url/{file_id}",
        headers={"sabio-auth-token": token},
        params={"download": download_flag},
        timeout=30,
    )
    reply.raise_for_status()
    return reply.json()


PREVIEW_STATUSES = {"done", "pending", "unsupported", "none"}


def describe_preview(preview: str | None) -> str:
    if not preview:
        return "<missing>"
    if preview in PREVIEW_STATUSES:
        return f"status={preview}"
    return f"url={preview[:80]}..."


def main() -> int:
    """Log in, fetch the configured text, and print details for each referenced file.

    For every file ID found in the text, the metadata is printed first and
    the URL information second. An HTTP error on either lookup is reported
    on its own line and the loop moves on to the next file.

    Returns:
        Always ``0``; used as the process exit status.
    """
    session_token = login(BASE_URL, USERNAME, PASSWORD)
    payload = fetch_text(BASE_URL, session_token, TEXT_ID)

    # Unwrap the data/result envelope, tolerating missing or null levels.
    envelope = payload.get("data") or {}
    text_info = envelope.get("result") or {}
    text_id = text_info.get("id")
    text_title = text_info.get("title")
    print(f"Text:  {text_id}  ({text_title!r})")

    references = extract_file_ids(payload)
    if not references:
        print("No file references found in this text.")
        return 0

    print(f"Found {len(references)} unique file reference(s):")
    for position, reference in enumerate(references, 1):
        print(f"\n  {position}. {reference}")

        # Metadata lookup; on an HTTP failure report the status and skip the file.
        try:
            meta_envelope = fetch_metadata(BASE_URL, session_token, reference).get("data") or {}
        except requests.HTTPError as http_error:
            print(f"     metadata: HTTP {http_error.response.status_code}")
            continue
        meta = meta_envelope.get("result") or {}

        filename = meta.get("filename")
        mime_type = meta.get("mimeType")
        size = meta.get("size")
        permission = meta.get("userPermission")
        print(f"     filename:   {filename}")
        print(f"     mimeType:   {mime_type}")
        print(f"     size:       {size} bytes")
        print(f"     permission: {permission}  (bitfield: 1=read 2=create 4=update 8=delete)")

        # URL lookup; handled the same way as the metadata lookup.
        try:
            url_envelope = fetch_urls(BASE_URL, session_token, reference).get("data") or {}
        except requests.HTTPError as http_error:
            print(f"     url:        HTTP {http_error.response.status_code}")
            continue
        urls = url_envelope.get("result") or {}

        # Show at most 80 characters of the URL, marking truncation with "...".
        link = urls.get("url") or ""
        suffix = "..." if len(link) > 80 else ""
        preview_text = describe_preview(urls.get("preview"))
        print(f"     url:        {link[:80]}{suffix}")
        print(f"     preview:    {preview_text}")

    return 0


# Run main() only when executed as a script, and use its return value as the
# process exit status.
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)