|
| 1 | +use crate::errors::OfflineRetrievalError; |
| 2 | +use anyhow::{anyhow, Result}; |
| 3 | +use chrono::prelude::*; |
| 4 | +use oxigraph::io::RdfFormat; |
| 5 | +use reqwest::blocking::Client; |
| 6 | +use reqwest::header::{HeaderMap, HeaderValue, ACCEPT, CONTENT_TYPE, LINK}; |
| 7 | +use std::time::Duration; |
| 8 | + |
/// Per-request timeout used by [`FetchOptions::default`], in seconds.
const DEFAULT_TIMEOUT_SECS: u64 = 30;

/// Media types requested by [`FetchOptions::default`], most preferred first.
const DEFAULT_ACCEPT_ORDER: [&str; 3] = [
    "text/turtle",
    "application/rdf+xml",
    "application/n-triples",
];

/// URL suffixes offered by [`FetchOptions::default`] for deriving fallback URLs.
const DEFAULT_EXTENSION_CANDIDATES: [&str; 5] =
    [".ttl", ".rdf", ".owl", "index.ttl", "index.rdf"];

/// Settings that control how RDF documents are retrieved over HTTP.
#[derive(Debug, Clone)]
pub struct FetchOptions {
    /// When `true`, the fetch helpers in this module return an error instead
    /// of performing any network request.
    pub offline: bool,
    /// Timeout applied to the HTTP client built for each call.
    pub timeout: Duration,
    /// Media types to request, in descending order of preference; used to
    /// build the `Accept` header and to filter `Link` alternates.
    pub accept_order: Vec<&'static str>,
    /// Suffixes used to derive fallback URLs when the original URL does not
    /// yield RDF.
    pub extension_candidates: Vec<&'static str>,
}

impl Default for FetchOptions {
    /// Online mode, a 30 second timeout, Turtle preferred over RDF/XML over
    /// N-Triples, and the common RDF file suffixes as fallbacks.
    fn default() -> Self {
        Self {
            offline: false,
            timeout: Duration::from_secs(DEFAULT_TIMEOUT_SECS),
            accept_order: DEFAULT_ACCEPT_ORDER.to_vec(),
            extension_candidates: DEFAULT_EXTENSION_CANDIDATES.to_vec(),
        }
    }
}
| 31 | + |
| 32 | +#[derive(Debug, Clone)] |
| 33 | +pub struct FetchResult { |
| 34 | + pub bytes: Vec<u8>, |
| 35 | + pub format: Option<RdfFormat>, |
| 36 | + pub final_url: String, |
| 37 | + pub content_type: Option<String>, |
| 38 | +} |
| 39 | + |
| 40 | +fn detect_format(ct: &str) -> Option<RdfFormat> { |
| 41 | + let ct = ct.split(';').next().unwrap_or("").trim().to_ascii_lowercase(); |
| 42 | + match ct.as_str() { |
| 43 | + "text/turtle" | "application/x-turtle" => Some(RdfFormat::Turtle), |
| 44 | + "application/rdf+xml" => Some(RdfFormat::RdfXml), |
| 45 | + "application/n-triples" | "application/ntriples" | "text/plain" => Some(RdfFormat::NTriples), |
| 46 | + _ => None, |
| 47 | + } |
| 48 | +} |
| 49 | + |
/// Builds an HTTP `Accept` header value from media types listed in
/// descending order of preference.
///
/// The first type gets `q=1.0` and each following type 0.1 less, never going
/// below `q=0.1`. A trailing `*/*; q=0.1` is always appended. An empty list
/// yields just `*/*`.
fn build_accept(accept_order: &[&'static str]) -> String {
    if accept_order.is_empty() {
        return "*/*".to_string();
    }

    let ranked = accept_order.iter().enumerate().map(|(rank, media_type)| {
        // Weight drops by 0.1 per position and is floored at 0.1.
        let weight = (1.0f32 - 0.1f32 * rank as f32).max(0.1);
        format!("{media_type}; q={weight:.1}")
    });

    ranked
        .chain(std::iter::once("*/*; q=0.1".to_string()))
        .collect::<Vec<_>>()
        .join(", ")
}
| 64 | + |
/// Derives fallback URLs from `orig` by applying the configured suffixes.
///
/// * A URL whose path ends in `/` (or that has no path at all, such as
///   `http://example.org`) is treated as a directory: every entry of `exts`
///   is appended to it.
/// * Any other URL is treated as a file: each entry of `exts` that starts
///   with `.` replaces the existing file extension, or is appended when the
///   last path segment has none. Entries without a leading dot (index file
///   names) only make sense for directories and are skipped.
///
/// The suffixes are applied to the path only. A fragment is dropped and a
/// query string is carried over unchanged after the new path.
fn build_extension_candidates(orig: &str, exts: &[&str]) -> Vec<String> {
    // Work on the path alone so that dots or slashes in `?query` / `#fragment`
    // are never mistaken for a file extension or a path separator.
    let without_fragment = orig.split('#').next().unwrap_or(orig);
    let (path_part, query) = match without_fragment.find('?') {
        Some(pos) => without_fragment.split_at(pos),
        None => (without_fragment, ""),
    };

    // `scheme://host` with no path: the last `/` belongs to the scheme
    // separator, so the host must not be parsed as a file name.
    let authority_only = path_part
        .find("://")
        .map_or(false, |pos| !path_part[pos + 3..].contains('/'));

    if authority_only || path_part.ends_with('/') {
        let separator = if authority_only { "/" } else { "" };
        return exts
            .iter()
            .map(|ext| format!("{path_part}{separator}{ext}{query}"))
            .collect();
    }

    // Strip the current extension (if any) from the last path segment.
    let name_start = path_part.rfind('/').map_or(0, |pos| pos + 1);
    let base = match path_part[name_start..].rfind('.') {
        Some(dot) => &path_part[..name_start + dot],
        None => path_part,
    };

    exts.iter()
        .filter(|ext| ext.starts_with('.'))
        .map(|ext| format!("{base}{ext}{query}"))
        .collect()
}
| 89 | + |
| 90 | +fn parse_link_alternates(headers: &HeaderMap, accept_order: &[&'static str]) -> Vec<String> { |
| 91 | + let mut out = Vec::new(); |
| 92 | + if let Some(link_val) = headers.get(LINK) { |
| 93 | + if let Ok(link_str) = link_val.to_str() { |
| 94 | + for part in link_str.split(',') { |
| 95 | + let part = part.trim(); |
| 96 | + if !part.contains("rel=\"alternate\"") { |
| 97 | + continue; |
| 98 | + } |
| 99 | + // Try to extract type and URL |
| 100 | + let has_rdf_type = accept_order |
| 101 | + .iter() |
| 102 | + .any(|typ| part.contains(&format!("type=\"{}\"", typ))); |
| 103 | + if !has_rdf_type { |
| 104 | + continue; |
| 105 | + } |
| 106 | + if let Some(start) = part.find('<') { |
| 107 | + if let Some(end) = part[start + 1..].find('>') { |
| 108 | + let url = &part[start + 1..start + 1 + end]; |
| 109 | + out.push(url.to_string()); |
| 110 | + } |
| 111 | + } |
| 112 | + } |
| 113 | + } |
| 114 | + } |
| 115 | + out |
| 116 | +} |
| 117 | + |
| 118 | +fn try_get( |
| 119 | + url: &str, |
| 120 | + client: &Client, |
| 121 | + accept: &str, |
| 122 | +) -> Result<(Vec<u8>, Option<String>, Option<String>, String, reqwest::StatusCode)> { |
| 123 | + let resp = client.get(url).header(ACCEPT, accept).send()?; |
| 124 | + let status = resp.status(); |
| 125 | + let final_url = resp.url().to_string(); |
| 126 | + let ct = resp |
| 127 | + .headers() |
| 128 | + .get(CONTENT_TYPE) |
| 129 | + .and_then(|h| h.to_str().ok()) |
| 130 | + .map(|s| s.to_string()); |
| 131 | + let link = resp |
| 132 | + .headers() |
| 133 | + .get(LINK) |
| 134 | + .and_then(|h| h.to_str().ok()) |
| 135 | + .map(|s| s.to_string()); |
| 136 | + let bytes = resp.bytes()?.to_vec(); |
| 137 | + Ok((bytes, ct, link, final_url, status)) |
| 138 | +} |
| 139 | + |
| 140 | +pub fn fetch_rdf(url: &str, opts: &FetchOptions) -> Result<FetchResult> { |
| 141 | + if opts.offline { |
| 142 | + return Err(anyhow!(OfflineRetrievalError { |
| 143 | + file: url.to_string() |
| 144 | + })); |
| 145 | + } |
| 146 | + let client = Client::builder().timeout(opts.timeout).build()?; |
| 147 | + let accept = build_accept(&opts.accept_order); |
| 148 | + |
| 149 | + // First attempt |
| 150 | + let (bytes, ct, link, final_url, status) = try_get(url, &client, &accept)?; |
| 151 | + |
| 152 | + // If success and looks RDF by Content-Type, return |
| 153 | + if status.is_success() { |
| 154 | + if let Some(ref cts) = ct { |
| 155 | + if let Some(fmt) = detect_format(cts) { |
| 156 | + return Ok(FetchResult { |
| 157 | + bytes, |
| 158 | + format: Some(fmt), |
| 159 | + final_url, |
| 160 | + content_type: ct, |
| 161 | + }); |
| 162 | + } |
| 163 | + } |
| 164 | + // Unknown or HTML content-type: fall through to alternates with Link header hints |
| 165 | + } |
| 166 | + |
| 167 | + // Try Link: rel="alternate" first if present |
| 168 | + if let Some(link_header) = link { |
| 169 | + let mut headers = HeaderMap::new(); |
| 170 | + headers.insert( |
| 171 | + LINK, |
| 172 | + HeaderValue::from_str(&link_header).unwrap_or(HeaderValue::from_static("")), |
| 173 | + ); |
| 174 | + for alt in parse_link_alternates(&headers, &opts.accept_order) { |
| 175 | + let (b2, ct2, _link2, fu2, st2) = try_get(&alt, &client, &accept)?; |
| 176 | + if st2.is_success() { |
| 177 | + let fmt = ct2.as_deref().and_then(detect_format); |
| 178 | + return Ok(FetchResult { |
| 179 | + bytes: b2, |
| 180 | + format: fmt, |
| 181 | + final_url: fu2, |
| 182 | + content_type: ct2, |
| 183 | + }); |
| 184 | + } |
| 185 | + } |
| 186 | + } |
| 187 | + |
| 188 | + // Status-based or type-based fallbacks |
| 189 | + if !status.is_success() || ct.as_deref().map(|s| s.contains("html")).unwrap_or(true) { |
| 190 | + for candidate in build_extension_candidates(url, &opts.extension_candidates) { |
| 191 | + let (b2, ct2, _link2, fu2, st2) = try_get(&candidate, &client, &accept)?; |
| 192 | + if st2.is_success() { |
| 193 | + let fmt = ct2.as_deref().and_then(detect_format); |
| 194 | + return Ok(FetchResult { |
| 195 | + bytes: b2, |
| 196 | + format: fmt, |
| 197 | + final_url: fu2, |
| 198 | + content_type: ct2, |
| 199 | + }); |
| 200 | + } |
| 201 | + } |
| 202 | + } |
| 203 | + |
| 204 | + // As a last resort, if the original was successful but with unknown CT, return it. |
| 205 | + if status.is_success() { |
| 206 | + let fmt = ct.as_deref().and_then(detect_format); |
| 207 | + return Ok(FetchResult { |
| 208 | + bytes, |
| 209 | + format: fmt, |
| 210 | + final_url, |
| 211 | + content_type: ct, |
| 212 | + }); |
| 213 | + } |
| 214 | + |
| 215 | + Err(anyhow!( |
| 216 | + "Failed to retrieve RDF from {} (HTTP {}) and fallbacks", |
| 217 | + url, |
| 218 | + status |
| 219 | + )) |
| 220 | +} |
| 221 | + |
| 222 | +pub fn head_last_modified(url: &str, opts: &FetchOptions) -> Result<Option<DateTime<Utc>>> { |
| 223 | + if opts.offline { |
| 224 | + return Err(anyhow!(OfflineRetrievalError { |
| 225 | + file: url.to_string() |
| 226 | + })); |
| 227 | + } |
| 228 | + let client = Client::builder().timeout(opts.timeout).build()?; |
| 229 | + let accept = build_accept(&opts.accept_order); |
| 230 | + let resp = client.head(url).header(ACCEPT, accept).send()?; |
| 231 | + if !resp.status().is_success() { |
| 232 | + return Ok(None); |
| 233 | + } |
| 234 | + if let Some(h) = resp.headers().get("Last-Modified") { |
| 235 | + if let Ok(s) = h.to_str() { |
| 236 | + if let Ok(dt) = DateTime::parse_from_rfc2822(s) { |
| 237 | + return Ok(Some(dt.with_timezone(&Utc))); |
| 238 | + } |
| 239 | + } |
| 240 | + } |
| 241 | + Ok(None) |
| 242 | +} |
| 243 | + |
| 244 | +pub fn head_exists(url: &str, opts: &FetchOptions) -> Result<bool> { |
| 245 | + if opts.offline { |
| 246 | + return Err(anyhow!(OfflineRetrievalError { |
| 247 | + file: url.to_string() |
| 248 | + })); |
| 249 | + } |
| 250 | + let client = Client::builder().timeout(opts.timeout).build()?; |
| 251 | + let accept = build_accept(&opts.accept_order); |
| 252 | + let resp = client.head(url).header(ACCEPT, accept).send()?; |
| 253 | + Ok(resp.status().is_success()) |
| 254 | +} |
0 commit comments