diff --git a/.gitignore b/.gitignore index 031e0e41..edc7eee8 100644 --- a/.gitignore +++ b/.gitignore @@ -11,7 +11,7 @@ NOTE.md Rocket.toml *.pem **/data/ -db/ +/db/ *.db *.db-shm *.db-wal @@ -168,4 +168,5 @@ rsky-wintermute/*.md rsky-wintermute/*.json !rsky-wintermute/README.md CLAUDE.md -.claude \ No newline at end of file +.claude +docs \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index a82a28fc..59a234de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -417,6 +417,42 @@ dependencies = [ "trait-variant", ] +[[package]] +name = "atrium-api" +version = "0.25.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f182d9437cd447ed87eca75540151653e332d6753a2a4749d72c0f15aa1f179" +dependencies = [ + "atrium-common", + "atrium-xrpc", + "chrono", + "http 1.3.1", + "ipld-core", + "langtag", + "regex", + "serde", + "serde_bytes", + "serde_json", + "thiserror 1.0.69", + "tokio", + "trait-variant", +] + +[[package]] +name = "atrium-common" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eff94b4ce3e9ba11d8bda83674e75ccaca281d5251ec3816d03e6bb23583ff4f" +dependencies = [ + "dashmap 6.1.0", + "lru 0.12.5", + "moka", + "thiserror 1.0.69", + "tokio", + "trait-variant", + "web-time", +] + [[package]] name = "atrium-xrpc" version = "0.12.3" @@ -778,7 +814,7 @@ dependencies = [ "rustls-pki-types", "tokio", "tokio-rustls 0.26.2", - "tower", + "tower 0.5.2", "tracing", ] @@ -900,6 +936,74 @@ dependencies = [ "tracing", ] +[[package]] +name = "axum" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +dependencies = [ + "async-trait", + "axum-core", + "axum-macros", + "bytes", + "futures-util", + "http 1.3.1", + "http-body 1.0.1", + "http-body-util", + "hyper 1.7.0", + "hyper-util", + "itoa 1.0.15", + "matchit", + "memchr", + "mime", + "multer", + "percent-encoding", + 
"pin-project-lite", + "rustversion", + "serde", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper 1.0.2", + "tokio", + "tower 0.5.2", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http 1.3.1", + "http-body 1.0.1", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper 1.0.2", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-macros" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57d123550fa8d071b7255cb0cc04dc302baa6c8c4a79f55701552684d8399bce" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "backtrace" version = "0.3.75" @@ -5039,6 +5143,21 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "jsonwebtoken" +version = "9.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a87cc7a48537badeae96744432de36f4be2b4a34a05a5ef32e9dd8a1c169dde" +dependencies = [ + "base64 0.22.1", + "js-sys", + "pem", + "ring", + "serde", + "serde_json", + "simple_asn1", +] + [[package]] name = "jwt-simple" version = "0.12.12" @@ -5533,6 +5652,12 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5" +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + [[package]] name = "maybe-rayon" version = "0.1.1" @@ -6574,6 +6699,16 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "pem" +version = "3.0.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d30c53c26bc5b31a98cd02d20f25a7c8567146caf63ed593a9d87b2775291be" +dependencies = [ + "base64 0.22.1", + "serde_core", +] + [[package]] name = "pem-rfc7468" version = "0.7.0" @@ -6890,6 +7025,7 @@ dependencies = [ "postgres-protocol", "serde", "serde_json", + "uuid 1.18.0", ] [[package]] @@ -7575,8 +7711,8 @@ dependencies = [ "tokio-native-tls", "tokio-rustls 0.26.2", "tokio-util", - "tower", - "tower-http", + "tower 0.5.2", + "tower-http 0.6.6", "tower-service", "url", "wasm-bindgen", @@ -7965,7 +8101,7 @@ name = "rsky-labeler" version = "0.1.3" dependencies = [ "anyhow", - "atrium-api", + "atrium-api 0.24.10", "atrium-xrpc-client", "chrono", "ciborium", @@ -8018,7 +8154,7 @@ dependencies = [ "anyhow", "argon2", "async-event-emitter", - "atrium-api", + "atrium-api 0.24.10", "atrium-xrpc-client", "aws-config", "aws-sdk-s3", @@ -8201,6 +8337,46 @@ dependencies = [ "url", ] +[[package]] +name = "rsky-video" +version = "0.1.0" +dependencies = [ + "atrium-api 0.25.7", + "atrium-xrpc", + "atrium-xrpc-client", + "axum", + "base64 0.22.1", + "bytes", + "chrono", + "cid 0.11.1", + "color-eyre", + "deadpool-postgres", + "futures", + "jsonwebtoken", + "k256", + "mockito", + "multihash-codetable", + "prometheus", + "rand 0.8.5", + "reqwest 0.12.23", + "rsky-syntax", + "rustls 0.23.31", + "sec1 0.7.3", + "serde", + "serde_json", + "tempfile", + "thiserror 2.0.16", + "tokio", + "tokio-postgres", + "tower 0.4.13", + "tower-http 0.5.2", + "tracing", + "tracing-subscriber", + "url", + "urlencoding", + "uuid 1.18.0", +] + [[package]] name = "rsky-wintermute" version = "0.1.0" @@ -8772,6 +8948,17 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_path_to_error" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457" +dependencies = [ + "itoa 1.0.15", + "serde", + "serde_core", +] + [[package]] name = 
"serde_qs" version = "0.12.0" @@ -9052,6 +9239,18 @@ version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" +[[package]] +name = "simple_asn1" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "297f631f50729c8c99b84667867963997ec0b50f32b2a7dbcab828ef0541e8bb" +dependencies = [ + "num-bigint", + "num-traits", + "thiserror 2.0.16", + "time", +] + [[package]] name = "siphasher" version = "0.3.11" @@ -10000,6 +10199,21 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d163a63c116ce562a22cda521fcc4d79152e7aba014456fb5eb442f6d6a10109" +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "pin-project", + "pin-project-lite", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "tower" version = "0.5.2" @@ -10013,6 +10227,24 @@ dependencies = [ "tokio", "tower-layer", "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" +dependencies = [ + "bitflags 2.9.2", + "bytes", + "http 1.3.1", + "http-body 1.0.1", + "http-body-util", + "pin-project-lite", + "tower-layer", + "tower-service", + "tracing", ] [[package]] @@ -10028,7 +10260,7 @@ dependencies = [ "http-body 1.0.1", "iri-string", "pin-project-lite", - "tower", + "tower 0.5.2", "tower-layer", "tower-service", ] @@ -10051,6 +10283,7 @@ version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ + "log", "pin-project-lite", 
"tracing-attributes", "tracing-core", @@ -10468,6 +10701,7 @@ checksum = "f33196643e165781c20a5ead5582283a7dacbb87855d867fbc2df3f81eddc1be" dependencies = [ "getrandom 0.3.3", "js-sys", + "serde", "wasm-bindgen", ] diff --git a/Cargo.toml b/Cargo.toml index a030ee51..83674550 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ members = [ "rsky-repo", "rsky-satnav", "rsky-syntax", + "rsky-video", "rsky-wintermute", ] resolver = "2" diff --git a/TODO.md b/TODO.md new file mode 100644 index 00000000..dacc03a3 --- /dev/null +++ b/TODO.md @@ -0,0 +1,66 @@ +# TODO - Blacksky Community Posts + +## Completed + +- [x] Client-side CID computation before submission +- [x] Appview CID verification (expectedCid parameter) +- [x] CidMismatch error on verification failure +- [x] Rename contentHash to cid across codebase + +## Remaining Work + +### High Priority + +- [ ] **Hydration-time CID verification in client** + - Fetch stub record from user's PDS to get authoritative CID + - Fetch hydrated content from appview + - Compute CID from hydrated content + - Compare: if mismatch, warn user that content may have been tampered + - Location: `blacksky.community/src/state/queries/community-feed.ts` + +- [ ] **Firehose listener for orphaned content cleanup** + - Listen for delete events on `community.blacksky.feed.post` collection + - When stub is deleted from PDS, delete corresponding content from appview's `community_post` table + - Prevents orphaned content when user deletes stub directly via `deleteRecord` + - Location: rsky-wintermute or separate service + +### Medium Priority + +- [ ] **Community post threadgate support** + - Implement `community.blacksky.feed.threadgate` record type + - Allow post authors to restrict who can reply + - Rules: mentionRule, followerRule, followingRule, listRule + - Lexicon exists at: `lexicons/community/blacksky/feed/threadgate.json` + +- [ ] **Community feed aggregation** + - Global community feed (all members' posts) + - Filtered by 
engagement, recency, etc. + - New endpoint: `community.blacksky.feed.getCommunityTimeline` + +### Low Priority + +- [ ] **Stub record verification on post fetch** + - When fetching a community post, optionally verify stub exists in user's PDS + - Ensures post wasn't created by appview without user's consent + - Trade-off: adds latency, may not be necessary for all use cases + +- [ ] **Content expiration policy** + - Allow users to set expiration on community posts + - Auto-delete content after expiration while keeping stub as tombstone + - Useful for ephemeral content + +## Notes + +### On-Demand Hydration Pattern +The community posts follow the "on-demand record hydration" pattern: +1. Stub record in user's PDS: `{ createdAt, cid }` +2. Full content stored on appview +3. CID computed by client = source of truth for integrity +4. Appview hydrates stub with full content on fetch + +### Integrity Guarantee +The CID in the stub is a cryptographic commitment: +- Computed by CLIENT from canonical record +- Stored in user's PDS (user controls) +- If appview modifies content, CID won't match +- Clients can verify by recomputing CID from hydrated content diff --git a/rsky-feedgen/src/lib.rs b/rsky-feedgen/src/lib.rs index 67737a51..3ec6becb 100644 --- a/rsky-feedgen/src/lib.rs +++ b/rsky-feedgen/src/lib.rs @@ -74,6 +74,7 @@ pub struct FeedGenConfig { pub sponsored_post_uri: String, pub sponsored_post_probability: f64, pub trending_percentile_min: f64, + pub pinned_post_uri: String, } pub mod apis; diff --git a/rsky-feedgen/src/main.rs b/rsky-feedgen/src/main.rs index a772d138..3e5ef344 100644 --- a/rsky-feedgen/src/main.rs +++ b/rsky-feedgen/src/main.rs @@ -141,6 +141,7 @@ fn rocket() -> _ { Ok(percentile) => percentile, }, }, + pinned_post_uri: env::var("PINNED_POST_URI").unwrap_or_default(), }; rocket::custom(figment) diff --git a/rsky-feedgen/src/routes.rs b/rsky-feedgen/src/routes.rs index 4365681b..85345b5c 100644 --- a/rsky-feedgen/src/routes.rs +++ 
b/rsky-feedgen/src/routes.rs @@ -155,7 +155,7 @@ pub async fn index( Err(_) => eprintln!("Failed to write anonymous visitor."), } } - match feed { + let mut result = match feed { _blacksky if _blacksky == BLACKSKY && !is_banned => { match crate::apis::get_all_posts(None, limit, cursor, true, connection, config).await { Ok(response) => Ok(Json(response)), @@ -408,7 +408,22 @@ pub async fn index( Json(internal_error), )) } + }; + + // Insert pinned post at position 0 on first load only (no cursor = first page) + // Skip for banned users who should only see the banned notice + if cursor.is_none() && !is_banned && !config.pinned_post_uri.is_empty() { + if let Ok(ref mut response) = result { + response.feed.insert( + 0, + crate::models::PostResult { + post: config.pinned_post_uri.clone(), + }, + ); + } } + + result } #[rocket::put("/cursor?&")] diff --git a/rsky-video/Cargo.toml b/rsky-video/Cargo.toml new file mode 100644 index 00000000..7a533d0b --- /dev/null +++ b/rsky-video/Cargo.toml @@ -0,0 +1,71 @@ +[package] +name = "rsky-video" +version = "0.1.0" +edition = "2024" +authors = ["Rudy Fraser "] +description = "Blacksky video service - handles video uploads, transcoding via Bunny Stream, and playback" + +[[bin]] +name = "video-service" +path = "src/main.rs" + +[dependencies] +# Async runtime +tokio = { workspace = true } +futures = { version = "0.3", default-features = false, features = ["std"] } + +# Web framework +axum = { version = "0.7", features = ["macros", "multipart"] } +tower = { version = "0.4", features = ["util"] } +tower-http = { version = "0.5", features = ["cors", "trace"] } + +# HTTP client for Bunny API +reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls-webpki-roots-no-provider", "stream"] } + +# Serialization +serde = { workspace = true } +serde_json = { workspace = true } + +# Database +tokio-postgres = { version = "0.7", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } +deadpool-postgres 
= "0.13" + +# Auth / crypto / TLS / encoding +jsonwebtoken = "9" +base64 = { version = "0.22", features = ["std"] } +rustls = { version = "0.23", default-features = false, features = ["aws_lc_rs"] } +k256 = { version = "0.13", features = ["ecdsa", "pem", "pkcs8"] } +sec1 = { version = "0.7", features = ["pem"] } +rand = "0.8" + +# Utilities +chrono = { version = "0.4", default-features = false, features = ["serde", "clock"] } +uuid = { version = "1", features = ["v4", "serde"] } +thiserror = "2" +color-eyre = "0.6" +url = "2" +bytes = "1" +urlencoding = "2" + +# CID generation +cid = "0.11" +multihash-codetable = { version = "0.1", features = ["sha2"] } + +# AT Protocol client (atrium from crates.io) +atrium-api = { version = "0.25", features = ["agent"] } +atrium-xrpc = "0.12" +atrium-xrpc-client = "0.5" + +# Logging/tracing +tracing = { version = "0.1", features = ["release_max_level_debug"] } +tracing-subscriber = { version = "0.3", features = ["env-filter"] } + +# Metrics +prometheus = { version = "0.13", features = ["process"] } + +# rsky crates +rsky-syntax = { workspace = true } + +[dev-dependencies] +mockito = "1.7.0" +tempfile = "3" diff --git a/rsky-video/src/auth/mod.rs b/rsky-video/src/auth/mod.rs new file mode 100644 index 00000000..d14d3e91 --- /dev/null +++ b/rsky-video/src/auth/mod.rs @@ -0,0 +1,173 @@ +//! Service authentication handling +//! +//! Validates service auth tokens from AT Protocol clients. +//! For MVP, we do basic JWT validation. Full validation would verify +//! the signature against the PDS's signing key. 
+
+use base64::{Engine, engine::general_purpose::URL_SAFE_NO_PAD};
+use serde::{Deserialize, Serialize};
+use tracing::{debug, warn};
+
+use crate::error::{Error, Result};
+
+/// Decoded service auth token claims.
+///
+/// Optional claims deserialize to `None` when the token omits them.
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ServiceAuthClaims {
+    /// Issuer (the user's DID when PDS signs on user's behalf)
+    pub iss: String,
+    /// Audience (should be the video service DID)
+    pub aud: String,
+    /// Subject (the user's DID) - optional, may use iss instead
+    pub sub: Option<String>,
+    /// Lexicon method being authorized
+    pub lxm: Option<String>,
+    /// Expiration time (Unix timestamp)
+    pub exp: i64,
+    /// Issued at time (Unix timestamp)
+    pub iat: Option<i64>,
+    /// JWT ID
+    pub jti: Option<String>,
+}
+
+impl ServiceAuthClaims {
+    /// Get the user DID - uses sub if present, otherwise iss
+    pub fn user_did(&self) -> &str {
+        self.sub.as_deref().unwrap_or(&self.iss)
+    }
+}
+
+/// Extract the bearer token from an `Authorization` header value.
+///
+/// # Errors
+/// Returns [`Error::Unauthorized`] when the header is missing or does not
+/// start with the `Bearer ` scheme prefix.
+pub fn extract_auth_header(auth_header: Option<&str>) -> Result<String> {
+    let header = auth_header
+        .ok_or_else(|| Error::Unauthorized("Missing Authorization header".to_string()))?;
+
+    // `strip_prefix` checks the scheme and yields the token in one step,
+    // avoiding a hard-coded byte offset into the header.
+    let token = header
+        .strip_prefix("Bearer ")
+        .ok_or_else(|| Error::Unauthorized("Invalid Authorization header format".to_string()))?;
+
+    Ok(token.to_string())
+}
+
+/// Decode and validate a service auth JWT (basic validation)
+///
+/// For MVP, this does:
+/// - Decode the JWT payload
+/// - Check expiration
+/// - Validate that the audience equals `expected_aud`
+/// - Validate that `lxm` equals `expected_lxm` when one is supplied
+///
+/// # Security
+/// The JWT signature is NOT verified here, so every claim (including the DID
+/// returned by [`ServiceAuthClaims::user_did`]) is unauthenticated input.
+/// A full implementation would also:
+/// - Resolve the issuer's signing key from their PDS
+/// - Verify the JWT signature
+///
+/// # Errors
+/// Returns [`Error::Unauthorized`] if the token is malformed, has expired, or
+/// fails the audience / `lxm` checks.
+pub fn validate_service_auth(
+    token: &str,
+    expected_aud: &str,
+    expected_lxm: Option<&str>,
+) -> Result<ServiceAuthClaims> {
+    // Split JWT into parts
+    let parts: Vec<&str> = token.split('.').collect();
+    if parts.len() != 3 {
+        return Err(Error::Unauthorized("Invalid JWT format".to_string()));
+    }
+
+    // Decode the payload (middle part)
+    let payload_bytes = URL_SAFE_NO_PAD
+        .decode(parts[1])
+        .map_err(|e| Error::Unauthorized(format!("Failed to decode JWT payload: {}", e)))?;
+
+    let claims: ServiceAuthClaims = serde_json::from_slice(&payload_bytes)
+        .map_err(|e| Error::Unauthorized(format!("Failed to parse JWT claims: {}", e)))?;
+
+    debug!(
+        "Service auth: iss={}, sub={:?}, aud={}, lxm={:?}, user_did={}",
+        claims.iss,
+        claims.sub,
+        claims.aud,
+        claims.lxm,
+        claims.user_did()
+    );
+
+    // Check expiration
+    let now = chrono::Utc::now().timestamp();
+    if claims.exp < now {
+        return Err(Error::Unauthorized("Token has expired".to_string()));
+    }
+
+    // Validate audience
+    if claims.aud != expected_aud {
+        warn!(
+            "Invalid audience: expected {}, got {}",
+            expected_aud, claims.aud
+        );
+        return Err(Error::Unauthorized("Invalid token audience".to_string()));
+    }
+
+    // Validate lexicon method if expected
+    if let Some(expected) = expected_lxm {
+        if claims.lxm.as_deref() != Some(expected) {
+            warn!("Invalid lxm: expected {}, got {:?}", expected, claims.lxm);
+            return Err(Error::Unauthorized("Invalid token scope".to_string()));
+        }
+    }
+
+    Ok(claims)
+}
+
+/// Decode service auth JWT without audience validation
+/// Used for uploadVideo where the token's audience is the user's PDS DID,
+/// not the video service DID. The video service forwards this token to the PDS.
+///
+/// # Security
+/// Only the payload is decoded and its expiration checked; the JWT signature
+/// is not verified, so the returned claims are unauthenticated.
+///
+/// # Errors
+/// Returns [`Error::Unauthorized`] if the token is malformed or has expired.
+pub fn decode_service_auth(token: &str) -> Result<ServiceAuthClaims> {
+    // Split JWT into parts
+    let parts: Vec<&str> = token.split('.').collect();
+    if parts.len() != 3 {
+        return Err(Error::Unauthorized("Invalid JWT format".to_string()));
+    }
+
+    // Decode the payload (middle part)
+    let payload_bytes = URL_SAFE_NO_PAD
+        .decode(parts[1])
+        .map_err(|e| Error::Unauthorized(format!("Failed to decode JWT payload: {}", e)))?;
+
+    let claims: ServiceAuthClaims = serde_json::from_slice(&payload_bytes)
+        .map_err(|e| Error::Unauthorized(format!("Failed to parse JWT claims: {}", e)))?;
+
+    debug!(
+        "Service auth (no aud check): iss={}, sub={:?}, aud={}, lxm={:?}, user_did={}",
+        claims.iss,
+        claims.sub,
+        claims.aud,
+        claims.lxm,
+        claims.user_did()
+    );
+
+    // Check expiration
+    let now = chrono::Utc::now().timestamp();
+    if claims.exp < now {
+        return Err(Error::Unauthorized("Token has expired".to_string()));
+    }
+
+    Ok(claims)
+}
+
+/// Extract the user DID from an Authorization header
+///
+/// The token's audience must equal `service_did`; no `lxm` is required.
+pub fn get_user_did(auth_header: Option<&str>, service_did: &str) -> Result<String> {
+    let token = extract_auth_header(auth_header)?;
+    let claims = validate_service_auth(&token, service_did, None)?;
+    Ok(claims.user_did().to_string())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_extract_auth_header() {
+        assert!(extract_auth_header(None).is_err());
+        assert!(extract_auth_header(Some("Basic xyz")).is_err());
+        assert_eq!(
+            extract_auth_header(Some("Bearer mytoken")).unwrap(),
+            "mytoken"
+        );
+    }
+}
diff --git a/rsky-video/src/bunny/mod.rs b/rsky-video/src/bunny/mod.rs
new file mode 100644
index 00000000..223772ea
--- /dev/null
+++ b/rsky-video/src/bunny/mod.rs
@@ -0,0 +1,201 @@
+//! 
+//! Bunny Stream API client
+
+mod types;
+
+pub use types::*;
+
+use crate::error::{Error, Result};
+use bytes::Bytes;
+use tracing::{debug, info};
+
+const BUNNY_API_BASE: &str = "https://video.bunnycdn.com";
+
+/// Client for interacting with Bunny Stream API
+///
+/// NOTE(review): `Debug` is derived, so `{:?}` output includes `api_key`;
+/// avoid logging this struct.
+#[derive(Debug, Clone)]
+pub struct BunnyClient {
+    library_id: String,
+    api_key: String,
+    pull_zone: String,
+    client: reqwest::Client,
+}
+
+/// Pass a successful response through; turn any other status into
+/// [`Error::BunnyApi`] carrying the status and response body.
+///
+/// `action` completes the message "Failed to {action}: ...".
+async fn ensure_success(response: reqwest::Response, action: &str) -> Result<reqwest::Response> {
+    let status = response.status();
+    if status.is_success() {
+        return Ok(response);
+    }
+
+    // The body is diagnostic only; an unreadable body becomes an empty string.
+    let body = response.text().await.unwrap_or_default();
+    Err(Error::BunnyApi(format!(
+        "Failed to {}: {} - {}",
+        action, status, body
+    )))
+}
+
+impl BunnyClient {
+    /// Build a client with a 300 second request timeout.
+    ///
+    /// # Panics
+    /// Panics if the underlying HTTP client cannot be constructed.
+    pub fn new(library_id: String, api_key: String, pull_zone: String) -> Self {
+        let client = reqwest::Client::builder()
+            .timeout(std::time::Duration::from_secs(300))
+            .build()
+            .expect("Failed to create HTTP client");
+
+        Self {
+            library_id,
+            api_key,
+            pull_zone,
+            client,
+        }
+    }
+
+    /// API URL of a single video inside this client's library.
+    fn video_url(&self, video_id: &str) -> String {
+        format!(
+            "{}/library/{}/videos/{}",
+            BUNNY_API_BASE, self.library_id, video_id
+        )
+    }
+
+    /// Create a new video object in Bunny Stream
+    /// Returns the created video, whose GUID can be used for uploading
+    pub async fn create_video(&self, title: &str) -> Result<CreateVideoResponse> {
+        let url = format!("{}/library/{}/videos", BUNNY_API_BASE, self.library_id);
+
+        debug!("Creating video in Bunny: {}", title);
+
+        let response = self
+            .client
+            .post(&url)
+            .header("AccessKey", &self.api_key)
+            .header("Content-Type", "application/json")
+            .json(&serde_json::json!({
+                "title": title
+            }))
+            .send()
+            .await?;
+
+        let response = ensure_success(response, "create video").await?;
+
+        let video: CreateVideoResponse = response.json().await?;
+        info!("Created Bunny video: {}", video.guid);
+        Ok(video)
+    }
+
+    /// Upload video binary data to Bunny Stream
+    pub async fn upload_video(&self, video_id: &str, data: Bytes) -> Result<()> {
+        let url = self.video_url(video_id);
+
+        debug!(
+            "Uploading {} bytes to Bunny video: {}",
+            data.len(),
+            video_id
+        );
+
+        let response = self
+            .client
+            .put(&url)
+            .header("AccessKey", &self.api_key)
+            .header("Content-Type", "application/octet-stream")
+            .body(data)
+            .send()
+            .await?;
+
+        ensure_success(response, "upload video").await?;
+
+        info!("Uploaded video to Bunny: {}", video_id);
+        Ok(())
+    }
+
+    /// Get video status from Bunny Stream
+    pub async fn get_video(&self, video_id: &str) -> Result<VideoInfo> {
+        let url = self.video_url(video_id);
+
+        let response = self
+            .client
+            .get(&url)
+            .header("AccessKey", &self.api_key)
+            .send()
+            .await?;
+
+        let response = ensure_success(response, "get video").await?;
+
+        Ok(response.json().await?)
+    }
+
+    /// Delete a video from Bunny Stream
+    pub async fn delete_video(&self, video_id: &str) -> Result<()> {
+        let url = self.video_url(video_id);
+
+        let response = self
+            .client
+            .delete(&url)
+            .header("AccessKey", &self.api_key)
+            .send()
+            .await?;
+
+        ensure_success(response, "delete video").await?;
+
+        info!("Deleted Bunny video: {}", video_id);
+        Ok(())
+    }
+
+    /// Get the HLS playlist URL for a video
+    pub fn get_playlist_url(&self, video_id: &str) -> String {
+        format!(
+            "https://{}.b-cdn.net/{}/playlist.m3u8",
+            self.pull_zone, video_id
+        )
+    }
+
+    /// Get the thumbnail URL for a video
+    pub fn get_thumbnail_url(&self, video_id: &str) -> String {
+        format!(
+            "https://{}.b-cdn.net/{}/thumbnail.jpg",
+            self.pull_zone, video_id
+        )
+    }
+
+    /// Get the configured pull zone value.
+    ///
+    /// The URL builders in this client place it in front of `.b-cdn.net`, so
+    /// it is the bare zone name rather than a full hostname.
+    pub fn pull_zone(&self) -> &str {
+        &self.pull_zone
+    }
+
+    /// Download the original video file from Bunny CDN
+    /// Returns the video bytes
+    pub async fn download_video(&self, video_id: &str) -> Result<Bytes> {
+        // The original upload is requested from the pull zone at `/{video_id}/original`.
+        // Unlike the API calls above, this request carries no `AccessKey` header.
+        let url = format!("https://{}.b-cdn.net/{}/original", self.pull_zone, video_id);
+
+        debug!("Downloading video from Bunny: {}", url);
+
+        let response = self.client.get(&url).send().await?;
+        let response = ensure_success(response, "download video").await?;
+
+        let bytes = response.bytes().await?;
+        info!("Downloaded {} bytes from Bunny", bytes.len());
+        Ok(bytes)
+    }
+}
diff --git a/rsky-video/src/bunny/types.rs b/rsky-video/src/bunny/types.rs
new file mode 100644
index 00000000..9d639bea
--- /dev/null
+++ b/rsky-video/src/bunny/types.rs
@@ -0,0 +1,137 @@
+//! Bunny Stream API types
+
+use serde::Deserialize;
+
+/// Response from creating a new video
+#[derive(Debug, Clone, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct CreateVideoResponse {
+    /// Unique video identifier (GUID)
+    pub guid: String,
+    /// Video title
+    pub title: Option<String>,
+    /// Library ID the video belongs to
+    pub video_library_id: i64,
+}
+
+/// Video information from Bunny Stream
+#[derive(Debug, Clone, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct VideoInfo {
+    /// Unique video identifier
+    pub guid: String,
+    /// Video title
+    pub title: Option<String>,
+    /// Video library ID
+    pub video_library_id: i64,
+    /// Encoding status (0-10)
+    pub status: i32,
+    /// Video duration in seconds
+    #[serde(default)]
+    pub length: f64,
+    /// Video width
+    #[serde(default)]
+    pub width: i32,
+    /// Video height
+    #[serde(default)]
+    pub height: i32,
+    /// File size in bytes
+    #[serde(default)]
+    pub storage_size: i64,
+    /// Thumbnail filename
+    pub thumbnail_file_name: Option<String>,
+    /// 
+    /// Encoding progress as reported by Bunny; exposed as a percentage by
+    /// `encoding_progress`. Defaults to 0 when the field is absent.
+    #[serde(default)]
+    pub encode_progress: i32,
+    /// Available resolutions
+    // NOTE(review): the type parameter was lost in this patch; `String` is
+    // assumed (a single string listing the resolutions) - confirm against
+    // the Bunny Stream video model.
+    #[serde(default)]
+    pub available_resolutions: Option<String>,
+}
+
+impl VideoInfo {
+    /// Check if encoding is complete (status `Finished` or `ResolutionFinished`)
+    pub fn is_encoding_complete(&self) -> bool {
+        self.status == BunnyStatus::Finished as i32
+            || self.status == BunnyStatus::ResolutionFinished as i32
+    }
+
+    /// Check if encoding failed (status `Failed`)
+    pub fn is_encoding_failed(&self) -> bool {
+        self.status == BunnyStatus::Failed as i32
+    }
+
+    /// Get encoding progress as percentage
+    pub fn encoding_progress(&self) -> i32 {
+        self.encode_progress
+    }
+}
+
+/// Webhook payload from Bunny Stream
+///
+/// Field names are matched in PascalCase (e.g. `VideoGuid`).
+#[derive(Debug, Clone, Deserialize)]
+#[serde(rename_all = "PascalCase")]
+pub struct WebhookPayload {
+    /// Video library ID
+    #[allow(dead_code)]
+    pub video_library_id: i64,
+    /// Video GUID
+    pub video_guid: String,
+    /// Status code (0-10)
+    /// 0 = Queued, 1 = Processing, 2 = Encoding, 3 = Finished
+    /// 4 = Resolution Finished, 5 = Failed
+    /// 6 = PresignedUploadStarted, 7 = PresignedUploadFinished
+    /// 8 = PresignedUploadFailed, 9 = CaptionsGenerated
+    /// 10 = TitleOrDescriptionGenerated
+    pub status: i32,
+}
+
+impl WebhookPayload {
+    /// Check if encoding is complete
+    pub fn is_finished(&self) -> bool {
+        self.status == BunnyStatus::Finished as i32
+    }
+
+    /// Check if a resolution finished (video playable)
+    pub fn is_resolution_finished(&self) -> bool {
+        self.status == BunnyStatus::ResolutionFinished as i32
+    }
+
+    /// Check if encoding failed
+    pub fn is_failed(&self) -> bool {
+        self.status == BunnyStatus::Failed as i32
+    }
+
+    /// Get human-readable status; codes outside 0-10 map to `"Unknown"`
+    pub fn status_name(&self) -> &'static str {
+        match self.status {
+            0 => "Queued",
+            1 => "Processing",
+            2 => "Encoding",
+            3 => "Finished",
+            4 => "ResolutionFinished",
+            5 => "Failed",
+            6 => "PresignedUploadStarted",
+            7 => "PresignedUploadFinished",
+            8 => "PresignedUploadFailed",
+            9 => "CaptionsGenerated",
+            10 => "TitleOrDescriptionGenerated",
+            _ => "Unknown",
+        }
+    }
+}
+
+/// Bunny encoding status codes
+#[allow(dead_code)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[repr(i32)]
+pub enum BunnyStatus {
+    Queued = 0,
+    Processing = 1,
+    Encoding = 2,
+    Finished = 3,
+    ResolutionFinished = 4,
+    Failed = 5,
+    PresignedUploadStarted = 6,
+    PresignedUploadFinished = 7,
+    PresignedUploadFailed = 8,
+    CaptionsGenerated = 9,
+    TitleOrDescriptionGenerated = 10,
+}
diff --git a/rsky-video/src/config.rs b/rsky-video/src/config.rs
new file mode 100644
index 00000000..5d5e2019
--- /dev/null
+++ b/rsky-video/src/config.rs
@@ -0,0 +1,79 @@
+//! Configuration for the video service
+
+use color_eyre::Result;
+use std::env;
+
+/// Application configuration loaded from environment variables
+///
+/// NOTE(review): `Debug` is derived, so `{:?}` output includes
+/// `bunny_api_key` and `database_url`; avoid logging this struct.
+#[derive(Debug, Clone)]
+pub struct AppConfig {
+    /// Host to bind to
+    pub host: String,
+    /// Port to listen on
+    pub port: u16,
+    /// Database connection URL
+    pub database_url: String,
+
+    /// Bunny Stream Library ID
+    pub bunny_library_id: String,
+    /// Bunny Stream API Key
+    pub bunny_api_key: String,
+    /// Bunny Pull Zone (e.g., "blacksky-video")
+    // NOTE(review): `BunnyClient` builds URLs as `https://{pull_zone}.b-cdn.net/...`.
+    // If this value is passed to it unchanged it must be the bare zone name,
+    // not a full `*.b-cdn.net` hostname - confirm.
+    pub bunny_pull_zone: String,
+
+    /// This service's DID (e.g., "did:web:video.blacksky.community")
+    pub service_did: String,
+    /// Public URL of this service
+    pub public_url: String,
+    /// Path to the signing key PEM file
+    pub signing_key_path: Option<String>,
+
+    /// Maximum video file size in bytes (default: 100MB)
+    pub max_video_size: u64,
+    /// Maximum video duration in seconds (default: 90)
+    pub max_video_duration: u32,
+    /// Daily video upload limit per user (default: 25)
+    pub daily_video_limit: u32,
+    /// Daily byte upload limit per user (default: 10GB)
+    pub daily_byte_limit: u64,
+}
+
+/// Read a required environment variable, naming it in the error if it is
+/// unset or not valid Unicode.
+fn required_env(name: &str) -> Result<String> {
+    env::var(name).map_err(|e| color_eyre::eyre::eyre!("{} must be set: {}", name, e))
+}
+
+/// Read an optional environment variable and parse it, falling back to
+/// `default` when it is unset or does not parse.
+fn env_or<T: std::str::FromStr>(name: &str, default: T) -> T {
+    env::var(name)
+        .ok()
+        .and_then(|raw| raw.parse().ok())
+        .unwrap_or(default)
+}
+
+impl AppConfig {
+    /// Load configuration from environment variables
+    ///
+    /// # Errors
+    /// Returns an error if `DATABASE_URL`, `BUNNY_LIBRARY_ID`,
+    /// `BUNNY_API_KEY` or `BUNNY_PULL_ZONE` is not set. All other variables
+    /// fall back to their defaults when unset or unparsable.
+    pub fn from_env() -> Result<Self> {
+        Ok(Self {
+            host: env::var("VIDEO_HOST").unwrap_or_else(|_| "0.0.0.0".to_string()),
+            port: env_or("VIDEO_PORT", 3500),
+            database_url: required_env("DATABASE_URL")?,
+
+            bunny_library_id: required_env("BUNNY_LIBRARY_ID")?,
+            bunny_api_key: required_env("BUNNY_API_KEY")?,
+            bunny_pull_zone: required_env("BUNNY_PULL_ZONE")?,
+
+            service_did: env::var("VIDEO_SERVICE_DID")
+                .unwrap_or_else(|_| "did:web:video.blacksky.community".to_string()),
+            public_url: env::var("VIDEO_PUBLIC_URL")
+                .unwrap_or_else(|_| "https://video.blacksky.community".to_string()),
+            signing_key_path: env::var("SIGNING_KEY_PATH").ok(),
+
+            max_video_size: env_or("MAX_VIDEO_SIZE", 100_000_000), // 100MB
+            max_video_duration: env_or("MAX_VIDEO_DURATION", 90),  // 90 seconds
+            daily_video_limit: env_or("DAILY_VIDEO_LIMIT", 25),
+            daily_byte_limit: env_or("DAILY_BYTE_LIMIT", 10_737_418_240), // 10GB
+        })
+    }
+}
diff --git a/rsky-video/src/db/mod.rs b/rsky-video/src/db/mod.rs
new file mode 100644
index 00000000..446cf615
--- /dev/null
+++ b/rsky-video/src/db/mod.rs
@@ -0,0 +1,444 @@
+//! 
+//! Database operations for video jobs and quotas
+
+use chrono::{DateTime, Utc};
+use deadpool_postgres::Pool;
+use serde::{Deserialize, Serialize};
+use serde_json::Value as JsonValue;
+use tracing::info;
+use uuid::Uuid;
+
+use crate::error::Result;
+
+/// Video job record (one row of `videos.video_jobs`)
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct VideoJob {
+    pub id: i64,
+    pub job_id: Uuid,
+    pub did: String,
+    pub bunny_video_id: Option<String>,
+    pub video_cid: Option<String>,
+    pub pds_blob_ref: Option<JsonValue>,
+    pub state: String,
+    pub progress: i32,
+    pub blob_ref: Option<JsonValue>,
+    pub error: Option<String>,
+    pub message: Option<String>,
+    pub original_filename: Option<String>,
+    pub file_size: Option<i64>,
+    pub created_at: DateTime<Utc>,
+    pub updated_at: DateTime<Utc>,
+}
+
+/// Job state constants
+#[allow(dead_code)]
+pub mod job_state {
+    pub const CREATED: &str = "JOB_STATE_CREATED";
+    pub const UPLOADING: &str = "JOB_STATE_UPLOADING";
+    pub const PROCESSING: &str = "JOB_STATE_PROCESSING";
+    pub const COMPLETED: &str = "JOB_STATE_COMPLETED";
+    pub const FAILED: &str = "JOB_STATE_FAILED";
+}
+
+/// Upload quota record (one row of `videos.upload_quotas`)
+#[allow(dead_code)]
+#[derive(Debug, Clone)]
+pub struct UploadQuota {
+    pub did: String,
+    pub daily_videos_used: i32,
+    pub daily_bytes_used: i64,
+    pub quota_reset_at: DateTime<Utc>,
+}
+
+/// Run database migrations
+///
+/// Every statement uses `IF NOT EXISTS`, so this can run on each startup.
+/// The statements execute one by one, not inside a transaction.
+pub async fn run_migrations(pool: &Pool) -> Result<()> {
+    let client = pool.get().await?;
+
+    client
+        .execute("CREATE SCHEMA IF NOT EXISTS videos", &[])
+        .await?;
+
+    client
+        .execute(
+            r#"
+            CREATE TABLE IF NOT EXISTS videos.video_jobs (
+                id BIGSERIAL PRIMARY KEY,
+                job_id UUID NOT NULL UNIQUE,
+                did TEXT NOT NULL,
+                bunny_video_id TEXT,
+                video_cid TEXT,
+                state TEXT NOT NULL DEFAULT 'JOB_STATE_CREATED',
+                progress INTEGER DEFAULT 0,
+                blob_ref JSONB,
+                error TEXT,
+                message TEXT,
+                original_filename TEXT,
+                file_size BIGINT,
+                created_at TIMESTAMPTZ DEFAULT NOW(),
+                updated_at TIMESTAMPTZ DEFAULT NOW()
+            )
+            "#,
+            &[],
+        )
+        .await?;
+
+    // Columns added after the table was first created; a no-op on fresh
+    // databases where CREATE TABLE already includes them (or, for
+    // pds_blob_ref, where this ALTER is what adds it).
+    client
+        .execute(
+            "ALTER TABLE videos.video_jobs ADD COLUMN IF NOT EXISTS video_cid TEXT",
+            &[],
+        )
+        .await?;
+
+    client
+        .execute(
+            "ALTER TABLE videos.video_jobs ADD COLUMN IF NOT EXISTS pds_blob_ref JSONB",
+            &[],
+        )
+        .await?;
+
+    client
+        .execute(
+            "CREATE INDEX IF NOT EXISTS idx_video_jobs_job_id ON videos.video_jobs (job_id)",
+            &[],
+        )
+        .await?;
+
+    client
+        .execute(
+            "CREATE INDEX IF NOT EXISTS idx_video_jobs_bunny_video_id ON videos.video_jobs (bunny_video_id)",
+            &[],
+        )
+        .await?;
+
+    client
+        .execute(
+            "CREATE INDEX IF NOT EXISTS idx_video_jobs_did ON videos.video_jobs (did)",
+            &[],
+        )
+        .await?;
+
+    client
+        .execute(
+            r#"
+            CREATE TABLE IF NOT EXISTS videos.upload_quotas (
+                did TEXT PRIMARY KEY,
+                daily_videos_used INTEGER DEFAULT 0,
+                daily_bytes_used BIGINT DEFAULT 0,
+                quota_reset_at TIMESTAMPTZ DEFAULT NOW(),
+                created_at TIMESTAMPTZ DEFAULT NOW()
+            )
+            "#,
+            &[],
+        )
+        .await?;
+
+    client
+        .execute(
+            r#"
+            CREATE TABLE IF NOT EXISTS videos.video_mappings (
+                id BIGSERIAL PRIMARY KEY,
+                did TEXT NOT NULL,
+                cid TEXT NOT NULL,
+                bunny_video_id TEXT NOT NULL,
+                created_at TIMESTAMPTZ DEFAULT NOW(),
+                UNIQUE(did, cid)
+            )
+            "#,
+            &[],
+        )
+        .await?;
+
+    client
+        .execute(
+            "CREATE INDEX IF NOT EXISTS idx_video_mappings_did_cid ON videos.video_mappings (did, cid)",
+            &[],
+        )
+        .await?;
+
+    info!("Database migrations completed");
+    Ok(())
+}
+
+/// Create a new video job
+///
+/// A fresh v4 UUID is generated as the `job_id`; the remaining columns take
+/// their table defaults. Returns the inserted row.
+pub async fn create_job(
+    pool: &Pool,
+    did: &str,
+    filename: Option<&str>,
+    file_size: Option<i64>,
+) -> Result<VideoJob> {
+    let client = pool.get().await?;
+    let job_id = Uuid::new_v4();
+
+    let row = client
+        .query_one(
+            r#"
+            INSERT INTO videos.video_jobs (job_id, did, original_filename, file_size)
+            VALUES ($1, $2, $3, $4)
+            RETURNING id, job_id, did, bunny_video_id, video_cid, pds_blob_ref, state, progress, blob_ref, error, message, original_filename, file_size, created_at, updated_at
+            "#,
+            &[&job_id, &did, &filename, &file_size],
+        )
+        .await?;
+
+    Ok(row_to_job(&row))
+}
+
+/// Get a job by job_id; `None` when no row matches
+pub async fn get_job(pool: &Pool, job_id: Uuid) -> Result<Option<VideoJob>> {
+    let client = pool.get().await?;
+
+    let row = client
+        .query_opt(
+            r#"
+            SELECT id, job_id, did, bunny_video_id, video_cid, pds_blob_ref, state, progress, blob_ref, error, message, original_filename, file_size, created_at, updated_at
+            FROM videos.video_jobs
+            WHERE job_id = $1
+            "#,
+            &[&job_id],
+        )
+        .await?;
+
+    Ok(row.map(|r| row_to_job(&r)))
+}
+
+/// Get a job by bunny_video_id (for webhook handling); `None` when no row matches
+pub async fn get_job_by_bunny_id(pool: &Pool, bunny_video_id: &str) -> Result<Option<VideoJob>> {
+    let client = pool.get().await?;
+
+    let row = client
+        .query_opt(
+            r#"
+            SELECT id, job_id, did, bunny_video_id, video_cid, pds_blob_ref, state, progress, blob_ref, error, message, original_filename, file_size, created_at, updated_at
+            FROM videos.video_jobs
+            WHERE bunny_video_id = $1
+            "#,
+            &[&bunny_video_id],
+        )
+        .await?;
+
+    Ok(row.map(|r| row_to_job(&r)))
+}
+
+/// Update job with bunny video ID and content CID
+///
+/// Also moves the job to `JOB_STATE_UPLOADING`.
+pub async fn set_bunny_video_id(
+    pool: &Pool,
+    job_id: Uuid,
+    bunny_video_id: &str,
+    video_cid: &str,
+) -> Result<()> {
+    let client = pool.get().await?;
+
+    client
+        .execute(
+            r#"
+            UPDATE videos.video_jobs
+            SET bunny_video_id = $2, video_cid = $3, state = 'JOB_STATE_UPLOADING', updated_at = NOW()
+            WHERE job_id = $1
+            "#,
+            &[&job_id, &bunny_video_id, &video_cid],
+        )
+        .await?;
+
+    Ok(())
+}
+
+/// Store the PDS blob reference (from com.atproto.repo.uploadBlob)
+///
+/// Overwrites `video_cid` as well; the job state is left unchanged.
+pub async fn set_pds_blob_ref(
+    pool: &Pool,
+    job_id: Uuid,
+    pds_blob_ref: JsonValue,
+    video_cid: &str,
+) -> Result<()> {
+    let client = pool.get().await?;
+
+    client
+        .execute(
+            r#"
+            UPDATE videos.video_jobs
+            SET pds_blob_ref = $2, video_cid = $3, updated_at = NOW()
+            WHERE job_id = $1
+            "#,
+            &[&job_id, &pds_blob_ref, &video_cid],
+        )
+        .await?;
+
+    Ok(())
+}
+
+/// Update job state
+pub async fn update_job_state(pool: &Pool, job_id: Uuid, state: &str, progress: i32) -> Result<()> {
+    let client = pool.get().await?;
+
+    client
+ .execute( + r#" + UPDATE videos.video_jobs + SET state = $2, progress = $3, updated_at = NOW() + WHERE job_id = $1 + "#, + &[&job_id, &state, &progress], + ) + .await?; + + Ok(()) +} + +/// Mark job as completed with blob ref +pub async fn complete_job(pool: &Pool, job_id: Uuid, blob_ref: JsonValue) -> Result<()> { + let client = pool.get().await?; + + client + .execute( + r#" + UPDATE videos.video_jobs + SET state = 'JOB_STATE_COMPLETED', progress = 100, blob_ref = $2, updated_at = NOW() + WHERE job_id = $1 + "#, + &[&job_id, &blob_ref], + ) + .await?; + + Ok(()) +} + +/// Mark job as failed +pub async fn fail_job(pool: &Pool, job_id: Uuid, error: &str) -> Result<()> { + let client = pool.get().await?; + + client + .execute( + r#" + UPDATE videos.video_jobs + SET state = 'JOB_STATE_FAILED', error = $2, updated_at = NOW() + WHERE job_id = $1 + "#, + &[&job_id, &error], + ) + .await?; + + Ok(()) +} + +/// Get or create upload quota for a user +pub async fn get_or_create_quota(pool: &Pool, did: &str) -> Result { + let client = pool.get().await?; + let now = Utc::now(); + + let row = client + .query_opt( + "SELECT did, daily_videos_used, daily_bytes_used, quota_reset_at FROM videos.upload_quotas WHERE did = $1", + &[&did], + ) + .await?; + + if let Some(row) = row { + let quota_reset_at: DateTime = row.get(3); + + if now.date_naive() > quota_reset_at.date_naive() { + client + .execute( + "UPDATE videos.upload_quotas SET daily_videos_used = 0, daily_bytes_used = 0, quota_reset_at = $2 WHERE did = $1", + &[&did, &now], + ) + .await?; + + return Ok(UploadQuota { + did: did.to_string(), + daily_videos_used: 0, + daily_bytes_used: 0, + quota_reset_at: now, + }); + } + + return Ok(UploadQuota { + did: row.get(0), + daily_videos_used: row.get(1), + daily_bytes_used: row.get(2), + quota_reset_at, + }); + } + + client + .execute( + "INSERT INTO videos.upload_quotas (did, quota_reset_at) VALUES ($1, $2) ON CONFLICT (did) DO NOTHING", + &[&did, &now], + ) + .await?; + + 
Ok(UploadQuota { + did: did.to_string(), + daily_videos_used: 0, + daily_bytes_used: 0, + quota_reset_at: now, + }) +} + +/// Increment quota usage +pub async fn increment_quota(pool: &Pool, did: &str, bytes: i64) -> Result<()> { + let client = pool.get().await?; + + client + .execute( + "UPDATE videos.upload_quotas SET daily_videos_used = daily_videos_used + 1, daily_bytes_used = daily_bytes_used + $2 WHERE did = $1", + &[&did, &bytes], + ) + .await?; + + Ok(()) +} + +/// Save video mapping (did/cid -> bunny_video_id) +pub async fn save_video_mapping( + pool: &Pool, + did: &str, + cid: &str, + bunny_video_id: &str, +) -> Result<()> { + let client = pool.get().await?; + + client + .execute( + r#" + INSERT INTO videos.video_mappings (did, cid, bunny_video_id) + VALUES ($1, $2, $3) + ON CONFLICT (did, cid) DO UPDATE SET bunny_video_id = $3 + "#, + &[&did, &cid, &bunny_video_id], + ) + .await?; + + Ok(()) +} + +/// Get bunny video ID from did/cid mapping +pub async fn get_bunny_video_id(pool: &Pool, did: &str, cid: &str) -> Result> { + let client = pool.get().await?; + + let row = client + .query_opt( + "SELECT bunny_video_id FROM videos.video_mappings WHERE did = $1 AND cid = $2", + &[&did, &cid], + ) + .await?; + + Ok(row.map(|r| r.get(0))) +} + +fn row_to_job(row: &tokio_postgres::Row) -> VideoJob { + VideoJob { + id: row.get(0), + job_id: row.get(1), + did: row.get(2), + bunny_video_id: row.get(3), + video_cid: row.get(4), + pds_blob_ref: row.get(5), + state: row.get(6), + progress: row.get(7), + blob_ref: row.get(8), + error: row.get(9), + message: row.get(10), + original_filename: row.get(11), + file_size: row.get(12), + created_at: row.get(13), + updated_at: row.get(14), + } +} diff --git a/rsky-video/src/error.rs b/rsky-video/src/error.rs new file mode 100644 index 00000000..6dde9bfd --- /dev/null +++ b/rsky-video/src/error.rs @@ -0,0 +1,103 @@ +//! 
Error types for the video service + +use axum::{ + Json, + http::StatusCode, + response::{IntoResponse, Response}, +}; +use serde_json::json; + +pub type Result = std::result::Result; + +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("Unauthorized: {0}")] + Unauthorized(String), + + #[error("Forbidden: {0}")] + Forbidden(String), + + #[error("Not found: {0}")] + NotFound(String), + + #[error("Bad request: {0}")] + BadRequest(String), + + #[error("Rate limited: {0}")] + RateLimited(String), + + #[error("Upload limit exceeded: {0}")] + UploadLimitExceeded(String), + + #[error("Video too large: {0}")] + VideoTooLarge(String), + + #[error("Internal error: {0}")] + Internal(String), + + #[error("Database error: {0}")] + Database(#[from] tokio_postgres::Error), + + #[error("Pool error: {0}")] + Pool(#[from] deadpool_postgres::PoolError), + + #[error("Bunny API error: {0}")] + BunnyApi(String), + + #[error("HTTP error: {0}")] + Http(#[from] reqwest::Error), + + #[error("JSON error: {0}")] + Json(#[from] serde_json::Error), +} + +impl IntoResponse for Error { + fn into_response(self) -> Response { + let (status, error_message) = match &self { + Error::Unauthorized(msg) => (StatusCode::UNAUTHORIZED, msg.clone()), + Error::Forbidden(msg) => (StatusCode::FORBIDDEN, msg.clone()), + Error::NotFound(msg) => (StatusCode::NOT_FOUND, msg.clone()), + Error::BadRequest(msg) => (StatusCode::BAD_REQUEST, msg.clone()), + Error::RateLimited(msg) => (StatusCode::TOO_MANY_REQUESTS, msg.clone()), + Error::UploadLimitExceeded(msg) => (StatusCode::TOO_MANY_REQUESTS, msg.clone()), + Error::VideoTooLarge(msg) => (StatusCode::PAYLOAD_TOO_LARGE, msg.clone()), + Error::Internal(msg) => (StatusCode::INTERNAL_SERVER_ERROR, msg.clone()), + Error::Database(e) => { + tracing::error!("Database error: {}", e); + ( + StatusCode::INTERNAL_SERVER_ERROR, + "Database error".to_string(), + ) + } + Error::Pool(e) => { + tracing::error!("Pool error: {}", e); + ( + 
StatusCode::INTERNAL_SERVER_ERROR, + "Database pool error".to_string(), + ) + } + Error::BunnyApi(msg) => { + tracing::error!("Bunny API error: {}", msg); + ( + StatusCode::BAD_GATEWAY, + format!("Video service error: {}", msg), + ) + } + Error::Http(e) => { + tracing::error!("HTTP error: {}", e); + (StatusCode::BAD_GATEWAY, "HTTP request failed".to_string()) + } + Error::Json(e) => { + tracing::error!("JSON error: {}", e); + (StatusCode::BAD_REQUEST, "Invalid JSON".to_string()) + } + }; + + let body = Json(json!({ + "error": error_message, + "message": error_message, + })); + + (status, body).into_response() + } +} diff --git a/rsky-video/src/main.rs b/rsky-video/src/main.rs new file mode 100644 index 00000000..e97b32b9 --- /dev/null +++ b/rsky-video/src/main.rs @@ -0,0 +1,171 @@ +//! Blacksky Video Service +//! +//! Handles video uploads, transcoding via Bunny Stream, and playback URL proxying. +//! Implements the app.bsky.video.* lexicon endpoints. + +use std::net::SocketAddr; +use std::sync::Arc; + +use axum::{ + Router, + extract::DefaultBodyLimit, + routing::{get, post}, +}; +use deadpool_postgres::{Config as PgConfig, Runtime}; +use rustls::crypto::aws_lc_rs::default_provider; +use tokio_postgres::NoTls; +use tower_http::cors::{Any, CorsLayer}; +use tower_http::trace::TraceLayer; +use tracing::info; +use tracing_subscriber::{EnvFilter, fmt, prelude::*}; + +mod auth; +mod bunny; +mod config; +mod db; +mod error; +mod pds; +mod signing; +mod xrpc; + +pub use config::AppConfig; +pub use error::{Error, Result}; + +/// Shared application state +pub struct AppState { + pub config: AppConfig, + pub db_pool: deadpool_postgres::Pool, + pub bunny_client: bunny::BunnyClient, + pub pds_client: pds::PdsClient, + pub http_client: reqwest::Client, + pub signer: Option, +} + +#[tokio::main] +async fn main() -> color_eyre::Result<()> { + color_eyre::install()?; + + // Initialize TLS crypto provider + default_provider().install_default().unwrap(); + + // Initialize tracing + 
tracing_subscriber::registry() + .with(fmt::layer()) + .with( + EnvFilter::try_from_default_env() + .unwrap_or_else(|_| EnvFilter::new("info,rsky_video=debug")), + ) + .init(); + + // Load configuration + let config = AppConfig::from_env()?; + info!( + "Starting Blacksky Video Service on {}:{}", + config.host, config.port + ); + + // Initialize database pool + let mut pg_config = PgConfig::new(); + pg_config.url = Some(config.database_url.clone()); + let db_pool = pg_config.create_pool(Some(Runtime::Tokio1), NoTls)?; + + // Run migrations + db::run_migrations(&db_pool).await?; + + // Initialize Bunny client + let bunny_client = bunny::BunnyClient::new( + config.bunny_library_id.clone(), + config.bunny_api_key.clone(), + config.bunny_pull_zone.clone(), + ); + + // Initialize HTTP client for PDS uploads + let http_client = reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(300)) + .build()?; + + // Initialize PDS client + let pds_client = pds::PdsClient::new(http_client.clone()); + + // Initialize service auth signer if key is configured + let signer = match &config.signing_key_path { + Some(path) => { + match signing::ServiceAuthSigner::from_pem_file(path, config.service_did.clone()) { + Ok(s) => { + info!("Service auth signing enabled"); + Some(s) + } + Err(e) => { + tracing::warn!( + "Failed to load signing key, PDS uploads will not work: {}", + e + ); + None + } + } + } + None => { + tracing::warn!( + "No signing key configured (SIGNING_KEY_PATH), PDS uploads will not work" + ); + None + } + }; + + // Create shared state + let state = Arc::new(AppState { + config: config.clone(), + db_pool, + bunny_client, + pds_client, + http_client, + signer, + }); + + // Build router + let app = Router::new() + // XRPC endpoints + .route( + "/xrpc/app.bsky.video.getUploadLimits", + get(xrpc::get_upload_limits), + ) + .route("/xrpc/app.bsky.video.uploadVideo", post(xrpc::upload_video)) + .route( + "/xrpc/app.bsky.video.getJobStatus", + 
get(xrpc::get_job_status), + ) + // Webhook endpoint for Bunny callbacks + .route("/webhook/bunny", post(xrpc::bunny_webhook)) + // Video proxy endpoints + .route("/stream/:did/:cid/playlist.m3u8", get(xrpc::proxy_playlist)) + .route( + "/stream/:did/:cid/thumbnail.jpg", + get(xrpc::proxy_thumbnail), + ) + // Health check + .route("/health", get(health_check)) + .route("/_health", get(health_check)) + // Add middleware + .layer(DefaultBodyLimit::max(5 * 1024 * 1024 * 1024)) // 5GB for video uploads + .layer(TraceLayer::new_for_http()) + .layer( + CorsLayer::new() + .allow_origin(Any) + .allow_methods(Any) + .allow_headers(Any), + ) + .with_state(state); + + // Start server + let addr = SocketAddr::from(([0, 0, 0, 0], config.port)); + info!("Listening on {}", addr); + + let listener = tokio::net::TcpListener::bind(addr).await?; + axum::serve(listener, app).await?; + + Ok(()) +} + +async fn health_check() -> &'static str { + "OK" +} diff --git a/rsky-video/src/pds/mod.rs b/rsky-video/src/pds/mod.rs new file mode 100644 index 00000000..ec40ab22 --- /dev/null +++ b/rsky-video/src/pds/mod.rs @@ -0,0 +1,308 @@ +//! PDS (Personal Data Server) client for uploading blobs +//! +//! Handles uploading video blobs to users' PDS instances. The video service +//! creates its own service auth tokens (signed with its private key) to upload +//! blobs on behalf of users. 
+ +use atrium_api::types::{BlobRef, TypedBlobRef}; +use base64::{Engine as _, engine::general_purpose::URL_SAFE_NO_PAD}; +use bytes::Bytes; +use serde::Deserialize; +use serde_json::Value as JsonValue; +use tracing::{debug, info}; + +use crate::error::{Error, Result}; + +/// JWT claims from service auth token +#[derive(Debug, Deserialize)] +struct ServiceAuthClaims { + /// Issuer (user's DID, signed by their PDS) + iss: String, + /// Audience (PDS DID - where the blob should be uploaded) + aud: String, + /// Subject (user's DID, optional) + #[serde(default)] + sub: Option, + /// Lexicon method + #[serde(default)] + #[allow(dead_code)] + lxm: Option, +} + +/// Response from DID document resolution +#[derive(Debug, Deserialize)] +struct DidDocument { + #[serde(default)] + service: Vec, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +struct DidService { + id: String, + #[serde(rename = "type")] + service_type: String, + service_endpoint: String, +} + +/// Client for interacting with PDS instances +pub struct PdsClient { + http_client: reqwest::Client, +} + +impl PdsClient { + pub fn new(http_client: reqwest::Client) -> Self { + Self { http_client } + } + + /// Decode a JWT token without verification to extract claims + /// The PDS will verify the token when we use it for upload + fn decode_token_claims(token: &str) -> Result { + let parts: Vec<&str> = token.split('.').collect(); + if parts.len() != 3 { + return Err(Error::Unauthorized("Invalid JWT format".to_string())); + } + + let payload = URL_SAFE_NO_PAD + .decode(parts[1]) + .map_err(|e| Error::Unauthorized(format!("Invalid JWT payload encoding: {}", e)))?; + + let claims: ServiceAuthClaims = serde_json::from_slice(&payload) + .map_err(|e| Error::Unauthorized(format!("Invalid JWT claims: {}", e)))?; + + Ok(claims) + } + + /// Extract the PDS DID from a service auth token + pub fn extract_pds_did(token: &str) -> Result { + let claims = Self::decode_token_claims(token)?; + Ok(claims.aud) + } 
+ + /// Extract the user DID from a service auth token + pub fn extract_user_did(token: &str) -> Result { + let claims = Self::decode_token_claims(token)?; + // Use sub if present, otherwise use iss + Ok(claims.sub.unwrap_or(claims.iss)) + } + + /// Resolve a DID to find the PDS endpoint + pub async fn resolve_pds_endpoint(&self, did: &str) -> Result { + // For did:web, we can derive the endpoint directly from the domain + // did:web:example.com -> https://example.com + // This is the standard AT Protocol approach - the PDS endpoint is the domain itself + if did.starts_with("did:web:") { + let domain = did.strip_prefix("did:web:").unwrap(); + let endpoint = format!("https://{}", domain); + + // Optionally try to resolve the DID document for additional verification, + // but fall back to the direct endpoint if it doesn't exist + let url = format!("https://{}/.well-known/did.json", domain); + debug!("Attempting to resolve did:web via: {}", url); + + match self.http_client.get(&url).send().await { + Ok(response) if response.status().is_success() => { + if let Ok(doc) = response.json::().await { + if let Ok(pds_endpoint) = self.extract_pds_from_did_doc(&doc, did) { + info!("Resolved {} via DID document to: {}", did, pds_endpoint); + return Ok(pds_endpoint); + } + } + } + _ => { + // DID document not found or invalid - use direct endpoint + debug!( + "No DID document found for {}, using direct endpoint: {}", + did, endpoint + ); + } + } + + // Fall back to direct endpoint derivation + info!("Using direct endpoint for {}: {}", did, endpoint); + return Ok(endpoint); + } + + // For did:plc, resolve via plc.directory + if did.starts_with("did:plc:") { + let url = format!("https://plc.directory/{}", did); + debug!("Resolving did:plc via plc.directory: {}", url); + + let response = self.http_client.get(&url).send().await?; + if !response.status().is_success() { + return Err(Error::Internal(format!( + "Failed to resolve DID {}: {}", + did, + response.status() + ))); + } + + 
let doc: DidDocument = response.json().await?; + return self.extract_pds_from_did_doc(&doc, did); + } + + Err(Error::Internal(format!("Unsupported DID method: {}", did))) + } + + /// Extract PDS endpoint from a DID document + fn extract_pds_from_did_doc(&self, doc: &DidDocument, did: &str) -> Result { + for service in &doc.service { + if service.id.ends_with("#atproto_pds") + && service.service_type == "AtprotoPersonalDataServer" + { + info!("Resolved {} to PDS: {}", did, service.service_endpoint); + return Ok(service.service_endpoint.clone()); + } + } + + Err(Error::Internal(format!( + "Could not find PDS endpoint for DID: {}", + did + ))) + } + + /// Upload a blob to a PDS by forwarding the client's service auth token + /// + /// The client provides a service auth token from their PDS with: + /// - iss: user's DID + /// - aud: user's PDS DID + /// - lxm: com.atproto.repo.uploadBlob + /// + /// We forward this token to the PDS, which verifies it against the user's DID document. + /// + /// # Arguments + /// * `client_token` - The service auth token from the client + /// * `user_did` - The user's DID + /// * `data` - The blob data to upload + /// * `mime_type` - MIME type of the blob + /// + /// # Returns + /// The blob reference from the PDS (with valid CID) + pub async fn upload_blob_with_token( + &self, + client_token: &str, + user_did: &str, + data: Bytes, + mime_type: &str, + ) -> Result { + // Resolve user's PDS endpoint from their DID + let pds_endpoint = self.resolve_pds_endpoint(user_did).await?; + + info!("Uploading blob to PDS: {} using client token", pds_endpoint); + let token = client_token; + + // Upload blob via direct HTTP request (not using atrium client) + // atrium's client has issues with the auth header for this use case + let upload_url = format!("{}/xrpc/com.atproto.repo.uploadBlob", pds_endpoint); + let size = data.len(); + debug!("Uploading {} bytes to {}", size, upload_url); + + let response = self + .http_client + .post(&upload_url) + 
.header("Authorization", format!("Bearer {}", token)) + .header("Content-Type", mime_type) + .body(data.to_vec()) + .send() + .await + .map_err(|e| Error::Internal(format!("PDS upload request failed: {}", e)))?; + + if !response.status().is_success() { + let status = response.status(); + let body = response.text().await.unwrap_or_default(); + return Err(Error::Internal(format!( + "PDS upload failed: {} - {}", + status, body + ))); + } + + // Parse response + #[derive(Deserialize)] + struct UploadBlobResponse { + blob: BlobRefResponse, + } + + #[derive(Deserialize)] + #[serde(rename_all = "camelCase")] + struct BlobRefResponse { + #[serde(rename = "$type")] + blob_type: Option, + #[serde(rename = "ref")] + cid_ref: CidRef, + mime_type: String, + size: u64, + } + + #[derive(Deserialize)] + struct CidRef { + #[serde(rename = "$link")] + link: String, + } + + let upload_response: UploadBlobResponse = response + .json() + .await + .map_err(|e| Error::Internal(format!("Failed to parse PDS response: {}", e)))?; + + info!( + "Blob uploaded to PDS: size={}, cid={}", + size, upload_response.blob.cid_ref.link + ); + + // Convert to atrium BlobRef format + // We need to construct the proper BlobRef type + let blob_ref = BlobRef::Typed(TypedBlobRef::Blob(atrium_api::types::Blob { + r#ref: atrium_api::types::CidLink( + cid::Cid::try_from(upload_response.blob.cid_ref.link.as_str()) + .map_err(|e| Error::Internal(format!("Invalid CID from PDS: {}", e)))?, + ), + mime_type: upload_response.blob.mime_type, + size: upload_response.blob.size as usize, + })); + + Ok(blob_ref) + } + + /// Convert an endpoint URL to a did:web + fn endpoint_to_did(&self, endpoint: &str) -> Result { + let url = url::Url::parse(endpoint) + .map_err(|e| Error::Internal(format!("Invalid endpoint URL: {}", e)))?; + + let host = url + .host_str() + .ok_or_else(|| Error::Internal("Endpoint has no host".to_string()))?; + + Ok(format!("did:web:{}", host)) + } +} + +/// Extract the CID string from a BlobRef +pub fn 
extract_cid(blob: &BlobRef) -> Option { + match blob { + BlobRef::Typed(TypedBlobRef::Blob(b)) => Some(b.r#ref.0.to_string()), + BlobRef::Untyped(u) => Some(u.cid.clone()), + } +} + +/// Convert atrium BlobRef to JSON value for storage +pub fn blob_ref_to_json(blob: &BlobRef) -> JsonValue { + match blob { + BlobRef::Typed(TypedBlobRef::Blob(b)) => { + serde_json::json!({ + "$type": "blob", + "ref": { + "$link": b.r#ref.0.to_string() + }, + "mimeType": b.mime_type, + "size": b.size + }) + } + BlobRef::Untyped(u) => { + // Legacy format - shouldn't happen for new uploads + serde_json::json!({ + "cid": u.cid, + "mimeType": u.mime_type + }) + } + } +} diff --git a/rsky-video/src/signing/mod.rs b/rsky-video/src/signing/mod.rs new file mode 100644 index 00000000..16c9aff2 --- /dev/null +++ b/rsky-video/src/signing/mod.rs @@ -0,0 +1,134 @@ +//! Service authentication token signing +//! +//! The video service needs to create service auth tokens to upload blobs +//! to users' PDS instances. This module handles loading the signing key +//! and creating properly signed JWTs. 
+ +use base64::{Engine as _, engine::general_purpose::URL_SAFE_NO_PAD}; +use k256::ecdsa::{Signature, SigningKey, signature::Signer}; +use k256::pkcs8::DecodePrivateKey; +use serde::{Deserialize, Serialize}; +use std::fs; +use std::path::Path; +use tracing::{debug, info}; + +use crate::error::{Error, Result}; + +/// JWT header for ES256K (secp256k1) +#[derive(Debug, Serialize)] +struct JwtHeader { + alg: &'static str, + typ: &'static str, +} + +/// Service auth token claims +#[derive(Debug, Serialize, Deserialize)] +pub struct ServiceAuthClaims { + /// Issued at timestamp (seconds since epoch) + pub iat: i64, + /// Expiration timestamp (seconds since epoch) + pub exp: i64, + /// Issuer - the video service DID + pub iss: String, + /// Audience - the target PDS DID + pub aud: String, + /// Subject - the user's DID (on whose behalf we're acting) + pub sub: String, + /// Lexicon method being called + pub lxm: String, + /// Unique token ID + pub jti: String, +} + +/// Signer for creating service auth tokens +pub struct ServiceAuthSigner { + signing_key: SigningKey, + service_did: String, +} + +impl ServiceAuthSigner { + /// Load the signing key from a PEM file + pub fn from_pem_file>(path: P, service_did: String) -> Result { + let pem_content = fs::read_to_string(&path) + .map_err(|e| Error::Internal(format!("Failed to read signing key: {}", e)))?; + + // Parse EC private key in PEM format + // First try PKCS#8 format, then fall back to SEC1 format + let signing_key = SigningKey::from_pkcs8_pem(&pem_content) + .or_else(|_| { + // Try SEC1 format (EC PRIVATE KEY) + use k256::SecretKey; + SecretKey::from_sec1_pem(&pem_content).map(|sk| SigningKey::from(sk)) + }) + .map_err(|e| Error::Internal(format!("Failed to parse signing key: {}", e)))?; + + info!("Loaded signing key for {}", service_did); + + Ok(Self { + signing_key, + service_did, + }) + } + + /// Create a service auth token for uploading a blob to a PDS + /// + /// # Arguments + /// * `pds_did` - The DID of the 
target PDS + /// * `user_did` - The DID of the user on whose behalf we're acting + /// * `ttl_seconds` - How long the token should be valid (default: 300s / 5min) + pub fn create_pds_upload_token( + &self, + pds_did: &str, + user_did: &str, + ttl_seconds: Option, + ) -> Result { + let now = chrono::Utc::now().timestamp(); + let ttl = ttl_seconds.unwrap_or(300); // 5 minutes default + + let claims = ServiceAuthClaims { + iat: now, + exp: now + ttl, + iss: self.service_did.clone(), + aud: pds_did.to_string(), + sub: user_did.to_string(), + lxm: "com.atproto.repo.uploadBlob".to_string(), + jti: uuid::Uuid::new_v4().to_string(), + }; + + debug!( + "Creating service auth token: iss={}, aud={}, sub={}", + claims.iss, claims.aud, claims.sub + ); + + self.sign_jwt(&claims) + } + + /// Sign a JWT with the service's private key + fn sign_jwt(&self, claims: &ServiceAuthClaims) -> Result { + // Create header + let header = JwtHeader { + alg: "ES256K", + typ: "JWT", + }; + + // Encode header and payload + let header_json = serde_json::to_string(&header) + .map_err(|e| Error::Internal(format!("Failed to serialize header: {}", e)))?; + let claims_json = serde_json::to_string(claims) + .map_err(|e| Error::Internal(format!("Failed to serialize claims: {}", e)))?; + + let header_b64 = URL_SAFE_NO_PAD.encode(header_json.as_bytes()); + let claims_b64 = URL_SAFE_NO_PAD.encode(claims_json.as_bytes()); + + // Create signing input + let signing_input = format!("{}.{}", header_b64, claims_b64); + + // Sign with secp256k1 + let signature: Signature = self.signing_key.sign(signing_input.as_bytes()); + let sig_bytes = signature.to_bytes(); + let sig_b64 = URL_SAFE_NO_PAD.encode(&sig_bytes); + + // Combine into JWT + Ok(format!("{}.{}", signing_input, sig_b64)) + } +} diff --git a/rsky-video/src/xrpc/mod.rs b/rsky-video/src/xrpc/mod.rs new file mode 100644 index 00000000..7caae8da --- /dev/null +++ b/rsky-video/src/xrpc/mod.rs @@ -0,0 +1,476 @@ +//! 
XRPC endpoint handlers for app.bsky.video.* methods + +use std::sync::Arc; + +use axum::{ + Json, + body::Body, + extract::{Path, Query, State}, + http::{HeaderMap, StatusCode, header}, + response::Response, +}; +use bytes::Bytes; +use serde::{Deserialize, Serialize}; +use serde_json::Value as JsonValue; +use tracing::{debug, error, info, warn}; +use uuid::Uuid; + +use crate::{ + AppState, auth, + bunny::WebhookPayload, + db::{self, job_state}, + error::{Error, Result}, + pds, +}; + +/// Query parameters for getUploadLimits +#[derive(Debug, Deserialize)] +pub struct GetUploadLimitsParams {} + +/// Response for getUploadLimits +#[derive(Debug, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct GetUploadLimitsResponse { + pub can_upload: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub remaining_daily_videos: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub remaining_daily_bytes: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub message: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub error: Option, +} + +/// GET /xrpc/app.bsky.video.getUploadLimits +pub async fn get_upload_limits( + State(state): State>, + headers: HeaderMap, +) -> Result> { + // Validate service auth + let auth_header = headers + .get(header::AUTHORIZATION) + .and_then(|v| v.to_str().ok()); + + let user_did = auth::get_user_did(auth_header, &state.config.service_did)?; + debug!("getUploadLimits for user: {}", user_did); + + // Get user's quota + let quota = db::get_or_create_quota(&state.db_pool, &user_did).await?; + + let remaining_videos = state.config.daily_video_limit as i32 - quota.daily_videos_used; + let remaining_bytes = state.config.daily_byte_limit as i64 - quota.daily_bytes_used; + + // Check if user can upload + let can_upload = remaining_videos > 0 && remaining_bytes > 0; + + let response = if can_upload { + GetUploadLimitsResponse { + can_upload: true, + remaining_daily_videos: Some(remaining_videos), + 
remaining_daily_bytes: Some(remaining_bytes), + message: None, + error: None, + } + } else if remaining_videos <= 0 { + GetUploadLimitsResponse { + can_upload: false, + remaining_daily_videos: Some(0), + remaining_daily_bytes: Some(remaining_bytes), + message: Some("User has exceeded daily upload videos limit".to_string()), + error: None, + } + } else { + GetUploadLimitsResponse { + can_upload: false, + remaining_daily_videos: Some(remaining_videos), + remaining_daily_bytes: Some(0), + message: Some("User has exceeded daily upload bytes limit".to_string()), + error: None, + } + }; + + Ok(Json(response)) +} + +/// Query parameters for uploadVideo +#[derive(Debug, Deserialize)] +pub struct UploadVideoParams { + pub did: String, + pub name: String, +} + +/// Response for uploadVideo +#[derive(Debug, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct UploadVideoResponse { + pub job_status: JobStatus, +} + +/// Job status in API responses +#[derive(Debug, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct JobStatus { + pub job_id: String, + pub did: String, + pub state: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub progress: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub blob: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub error: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub message: Option, +} + +/// POST /xrpc/app.bsky.video.uploadVideo +pub async fn upload_video( + State(state): State>, + headers: HeaderMap, + Query(params): Query, + body: Bytes, +) -> Result> { + // Extract service auth token + let auth_header = headers + .get(header::AUTHORIZATION) + .and_then(|v| v.to_str().ok()); + + // The token should be for com.atproto.repo.uploadBlob with aud: user's PDS DID + // We don't validate audience here since the token is meant for the PDS, not us. + // We forward this token to the PDS for blob upload. 
+ let token = auth::extract_auth_header(auth_header)?; + let claims = auth::decode_service_auth(&token)?; + + // Verify the DID matches + if claims.user_did() != params.did { + return Err(Error::Forbidden( + "Token subject does not match upload DID".to_string(), + )); + } + + let user_did = &params.did; + let file_size = body.len() as i64; + + info!( + "uploadVideo: did={}, name={}, size={}", + user_did, params.name, file_size + ); + + // Check file size + if file_size > state.config.max_video_size as i64 { + return Err(Error::VideoTooLarge(format!( + "file size ({} bytes) is larger than the maximum allowed size ({} bytes)", + file_size, state.config.max_video_size + ))); + } + + // Check quota + let quota = db::get_or_create_quota(&state.db_pool, user_did).await?; + let remaining_videos = state.config.daily_video_limit as i32 - quota.daily_videos_used; + let remaining_bytes = state.config.daily_byte_limit as i64 - quota.daily_bytes_used; + + if remaining_videos <= 0 { + return Err(Error::UploadLimitExceeded( + "User has exceeded daily upload videos limit".to_string(), + )); + } + if remaining_bytes < file_size { + return Err(Error::UploadLimitExceeded( + "User has exceeded daily upload bytes limit".to_string(), + )); + } + + // Create job in database + let job = db::create_job( + &state.db_pool, + user_did, + Some(&params.name), + Some(file_size), + ) + .await?; + + let job_id = job.job_id; + info!("Created job: {}", job_id); + + // STEP 1: Upload blob to user's PDS FIRST + // Forward the client's service auth token to the PDS. + // The token should have aud: user's PDS DID (not video service). 
+ info!( + "Uploading blob to PDS for user {} using client token", + user_did + ); + + let pds_blob_ref = match state + .pds_client + .upload_blob_with_token(&token, user_did, body.clone(), "video/mp4") + .await + { + Ok(blob) => blob, + Err(e) => { + error!("Failed to upload blob to PDS: {}", e); + db::fail_job(&state.db_pool, job_id, &format!("PDS upload failed: {}", e)).await?; + return Err(e); + } + }; + + // Extract the CID from the PDS blob_ref - this is the real content-addressed CID + let video_cid = pds::extract_cid(&pds_blob_ref) + .ok_or_else(|| Error::Internal("PDS returned invalid blob reference".to_string()))?; + info!("PDS returned blob with CID: {}", video_cid); + + // Convert to JSON for storage + let pds_blob_json = pds::blob_ref_to_json(&pds_blob_ref); + + // Store the PDS blob_ref in database + db::set_pds_blob_ref(&state.db_pool, job_id, pds_blob_json.clone(), &video_cid).await?; + + // STEP 2: Upload to Bunny Stream for transcoding + // Create video in Bunny Stream + let title = format!("{}_{}", user_did, params.name); + let bunny_video = match state.bunny_client.create_video(&title).await { + Ok(v) => v, + Err(e) => { + error!("Failed to create Bunny video: {}", e); + db::fail_job(&state.db_pool, job_id, &e.to_string()).await?; + return Err(e); + } + }; + + let bunny_video_id = bunny_video.guid.clone(); + db::set_bunny_video_id(&state.db_pool, job_id, &bunny_video_id, &video_cid).await?; + + // Upload video to Bunny for transcoding + if let Err(e) = state.bunny_client.upload_video(&bunny_video_id, body).await { + error!("Failed to upload to Bunny: {}", e); + db::fail_job(&state.db_pool, job_id, &e.to_string()).await?; + return Err(e); + } + + // Update job state to processing + db::update_job_state(&state.db_pool, job_id, job_state::PROCESSING, 0).await?; + + // Increment quota + db::increment_quota(&state.db_pool, user_did, file_size).await?; + + info!( + "Video uploaded: job={}, cid={}, bunny_id={}", + job_id, video_cid, bunny_video_id + ); 
+ + // Return flat JobStatus (not wrapped) - client expects this format + Ok(Json(JobStatus { + job_id: job_id.to_string(), + did: user_did.to_string(), + state: job_state::PROCESSING.to_string(), + progress: Some(0), + blob: None, + error: None, + message: Some("Video is being processed".to_string()), + })) +} + +/// Query parameters for getJobStatus +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct GetJobStatusParams { + pub job_id: String, +} + +/// Response for getJobStatus +#[derive(Debug, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct GetJobStatusResponse { + pub job_status: JobStatus, +} + +/// GET /xrpc/app.bsky.video.getJobStatus +pub async fn get_job_status( + State(state): State>, + Query(params): Query, +) -> Result> { + let job_id = Uuid::parse_str(¶ms.job_id) + .map_err(|_| Error::BadRequest("Invalid job ID format".to_string()))?; + + let job = db::get_job(&state.db_pool, job_id) + .await? + .ok_or_else(|| Error::NotFound("Job not found".to_string()))?; + + // If job is still processing, check Bunny status + let (state_str, progress) = if job.state == job_state::PROCESSING { + if let Some(bunny_id) = &job.bunny_video_id { + match state.bunny_client.get_video(bunny_id).await { + Ok(video_info) => { + if video_info.is_encoding_complete() { + (job_state::PROCESSING.to_string(), 99) + } else if video_info.is_encoding_failed() { + (job_state::FAILED.to_string(), job.progress) + } else { + (job.state.clone(), video_info.encoding_progress()) + } + } + Err(e) => { + warn!("Failed to get Bunny video status: {}", e); + (job.state.clone(), job.progress) + } + } + } else { + (job.state.clone(), job.progress) + } + } else { + (job.state.clone(), job.progress) + }; + + // Return wrapped format - SDK expects response.data.jobStatus + Ok(Json(GetJobStatusResponse { + job_status: JobStatus { + job_id: job.job_id.to_string(), + did: job.did, + state: state_str, + progress: Some(progress), + blob: job.blob_ref, + error: 
job.error, + message: job.message, + }, + })) +}
+
+/// POST /webhook/bunny - Handle Bunny Stream webhook callbacks
+///
+/// Looks the job up by `payload.video_guid` and then:
+/// - finished / resolution finished: saves the `(did, cid) -> bunny_video_id` mapping
+///   and completes the job with the PDS blob reference stored on the job;
+/// - failed: marks the job as failed;
+/// - status 0, 1 or 2 (queued / processing / encoding): records coarse progress.
+///
+/// Webhooks for unknown videos and webhooks with any other status code are
+/// acknowledged with `200 OK` and change nothing.
+///
+/// # Errors
+///
+/// Propagates database errors, and returns `Error::Internal` when a finished job has
+/// no video CID or no PDS blob reference.
+pub async fn bunny_webhook(
+    State(state): State>,
+    Json(payload): Json,
+) -> Result<StatusCode> {
+    info!(
+        "Bunny webhook: video={}, status={} ({})",
+        payload.video_guid,
+        payload.status,
+        payload.status_name()
+    );
+
+    // Find the job by bunny video ID
+    let job = match db::get_job_by_bunny_id(&state.db_pool, &payload.video_guid).await? {
+        Some(j) => j,
+        None => {
+            warn!("Webhook for unknown video: {}", payload.video_guid);
+            return Ok(StatusCode::OK);
+        }
+    };
+
+    if payload.is_finished() || payload.is_resolution_finished() {
+        // Video encoding is complete
+        info!("Video encoding complete: job={}", job.job_id);
+
+        // Get the content CID from the job (from PDS upload)
+        let video_cid = job
+            .video_cid
+            .ok_or_else(|| Error::Internal("Job missing video CID".to_string()))?;
+
+        // Use the PDS blob_ref that was stored during upload
+        // This is the real blob reference from the user's PDS
+        let blob_ref = job
+            .pds_blob_ref
+            .ok_or_else(|| Error::Internal("Job missing PDS blob reference".to_string()))?;
+
+        // Save the mapping for URL proxy: (did, cid) -> bunny_video_id
+        db::save_video_mapping(&state.db_pool, &job.did, &video_cid, &payload.video_guid).await?;
+
+        // Mark job as complete with the PDS blob_ref
+        db::complete_job(&state.db_pool, job.job_id, blob_ref).await?;
+
+        info!("Job completed: job={}, cid={}", job.job_id, video_cid);
+    } else if payload.is_failed() {
+        // Video encoding failed
+        error!("Video encoding failed: job={}", job.job_id);
+        db::fail_job(&state.db_pool, job.job_id, "Video encoding failed").await?;
+    } else {
+        // Only statuses with a known progress meaning may touch the job. Any other
+        // code is ignored: writing PROCESSING for it could pull a job that has
+        // already completed or failed back into the processing state.
+        let progress = match payload.status {
+            0 => 0,  // Queued
+            1 => 10, // Processing
+            2 => 50, // Encoding
+            other => {
+                debug!("Ignoring Bunny status {} for job={}", other, job.job_id);
+                return Ok(StatusCode::OK);
+            }
+        };
+        db::update_job_state(&state.db_pool, job.job_id, job_state::PROCESSING, progress).await?;
+    }
+
+    Ok(StatusCode::OK)
+}
+
+/// Path parameters for video proxy
+#[derive(Debug, Deserialize)]
+pub struct VideoProxyPath {
+    pub did: String,
+    pub cid: String,
+}
+
+/// GET /stream/:did/:cid/playlist.m3u8 - Proxy HLS playlist
+///
+/// Answers with a `307 Temporary Redirect`: to the Bunny playlist URL when the
+/// `(did, cid)` pair is known to our database, otherwise to the Bluesky video CDN.
+///
+/// # Errors
+///
+/// Returns `Error::BadRequest` when `did` or `cid` does not decode to valid UTF-8, or
+/// when the redirect URL cannot be carried in a `Location` header. Database lookup
+/// errors are propagated.
+pub async fn proxy_playlist(
+    State(state): State>,
+    Path(path): Path<VideoProxyPath>,
+) -> Result<Response> {
+    let did = urlencoding::decode(&path.did)
+        .map_err(|_| Error::BadRequest("Invalid DID encoding".to_string()))?;
+    let cid = urlencoding::decode(&path.cid)
+        .map_err(|_| Error::BadRequest("Invalid CID encoding".to_string()))?;
+
+    debug!("Proxy playlist: did={}, cid={}", did, cid);
+
+    // Look up the bunny video ID in our database
+    let redirect_url = match db::get_bunny_video_id(&state.db_pool, &did, &cid).await? {
+        Some(bunny_video_id) => {
+            // Video is in our system - redirect to Bunny CDN
+            state.bunny_client.get_playlist_url(&bunny_video_id)
+        }
+        None => {
+            // Video not in our system - fallback to Bluesky's video CDN
+            debug!(
+                "Video not in our DB, falling back to Bluesky CDN: did={}, cid={}",
+                did, cid
+            );
+            format!("https://video.bsky.app/watch/{}/{}/playlist.m3u8", did, cid)
+        }
+    };
+
+    // `did`/`cid` are caller-controlled and end up in the `Location` header. A value
+    // the header cannot carry (e.g. a decoded newline) must be a 400, not a panic.
+    Response::builder()
+        .status(StatusCode::TEMPORARY_REDIRECT)
+        .header(header::LOCATION, redirect_url)
+        .header(header::CACHE_CONTROL, "public, max-age=3600")
+        .body(Body::empty())
+        .map_err(|e| Error::BadRequest(format!("Invalid redirect target: {e}")))
+}
+
+/// GET /stream/:did/:cid/thumbnail.jpg - Proxy thumbnail
+///
+/// Answers with a `307 Temporary Redirect`: to the Bunny thumbnail URL when the
+/// `(did, cid)` pair is known to our database, otherwise to the Bluesky video CDN.
+///
+/// # Errors
+///
+/// Returns `Error::BadRequest` when `did` or `cid` does not decode to valid UTF-8, or
+/// when the redirect URL cannot be carried in a `Location` header. Database lookup
+/// errors are propagated.
+pub async fn proxy_thumbnail(
+    State(state): State>,
+    Path(path): Path<VideoProxyPath>,
+) -> Result<Response> {
+    let did = urlencoding::decode(&path.did)
+        .map_err(|_| Error::BadRequest("Invalid DID encoding".to_string()))?;
+    let cid = urlencoding::decode(&path.cid)
+        .map_err(|_| Error::BadRequest("Invalid CID encoding".to_string()))?;
+
+    debug!("Proxy thumbnail: did={}, cid={}", did, cid);
+
+    // Look up the bunny video ID in our database
+    let redirect_url = match db::get_bunny_video_id(&state.db_pool, &did, &cid).await? {
+        Some(bunny_video_id) => {
+            // Video is in our system - redirect to Bunny CDN
+            state.bunny_client.get_thumbnail_url(&bunny_video_id)
+        }
+        None => {
+            // Video not in our system - fallback to Bluesky's video CDN
+            debug!(
+                "Video not in our DB, falling back to Bluesky CDN: did={}, cid={}",
+                did, cid
+            );
+            format!("https://video.bsky.app/watch/{}/{}/thumbnail.jpg", did, cid)
+        }
+    };
+
+    // Same rule as the playlist proxy: an unusable `Location` value is a client
+    // error, never a panic.
+    Response::builder()
+        .status(StatusCode::TEMPORARY_REDIRECT)
+        .header(header::LOCATION, redirect_url)
+        .header(header::CACHE_CONTROL, "public, max-age=86400")
+        .body(Body::empty())
+        .map_err(|e| Error::BadRequest(format!("Invalid redirect target: {e}")))
+}
diff --git a/rsky-wintermute/migrations/add_notification_unique_constraint.sql b/rsky-wintermute/migrations/add_notification_unique_constraint.sql new file mode 100644 index 00000000..16cf109b --- /dev/null +++ b/rsky-wintermute/migrations/add_notification_unique_constraint.sql @@ -0,0 +1,25 @@ +-- Add unique constraint to notification table to prevent duplicates +-- This migration should be run AFTER deduplication (see dedupe_notifications.sql) +-- Or on a database with few/no duplicates + +-- Step 1: Delete duplicates, keeping the row with the lowest id +-- This uses a CTE with ROW_NUMBER to identify duplicates +DELETE FROM bsky.notification +WHERE id IN ( + SELECT id FROM ( + SELECT id, + ROW_NUMBER() OVER (PARTITION BY did, "recordUri", reason ORDER BY id) as rn + FROM bsky.notification + ) sub + WHERE rn > 1 +); + +-- Step 2: Add the unique constraint +-- Using CONCURRENTLY to avoid blocking other operations +CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS notification_did_recorduri_reason_unique_idx +ON bsky.notification (did, "recordUri", reason); + +-- Verify the constraint was created +SELECT indexname, indexdef +FROM pg_indexes +WHERE schemaname = 'bsky' AND tablename = 'notification'; diff --git a/rsky-wintermute/migrations/backfill_post_embeds.sql b/rsky-wintermute/migrations/backfill_post_embeds.sql new file mode 100644 index 00000000..2a5d2bde --- /dev/null +++
b/rsky-wintermute/migrations/backfill_post_embeds.sql @@ -0,0 +1,63 @@ +-- Backfill post_embed_image and post_embed_video tables from existing record JSON +-- Run this migration after deploying the indexer fix + +-- Step 1: Backfill post_embed_image from app.bsky.embed.images +INSERT INTO post_embed_image ("postUri", position, "imageCid", alt) +SELECT + r.uri as "postUri", + (img_idx.idx - 1)::text as position, + img.value->'image'->'ref'->>'$link' as "imageCid", + COALESCE(img.value->>'alt', '') as alt +FROM record r, + jsonb_array_elements((r.json::jsonb)->'embed'->'images') WITH ORDINALITY AS img(value, idx), + LATERAL (SELECT img.idx as idx) img_idx +WHERE r.uri LIKE 'at://%/app.bsky.feed.post/%' + AND (r.json::jsonb)->'embed'->>'$type' = 'app.bsky.embed.images' + AND img.value->'image'->'ref'->>'$link' IS NOT NULL +ON CONFLICT DO NOTHING; + +-- Step 2: Backfill post_embed_image from app.bsky.embed.recordWithMedia (images in media) +INSERT INTO post_embed_image ("postUri", position, "imageCid", alt) +SELECT + r.uri as "postUri", + (img_idx.idx - 1)::text as position, + img.value->'image'->'ref'->>'$link' as "imageCid", + COALESCE(img.value->>'alt', '') as alt +FROM record r, + jsonb_array_elements((r.json::jsonb)->'embed'->'media'->'images') WITH ORDINALITY AS img(value, idx), + LATERAL (SELECT img.idx as idx) img_idx +WHERE r.uri LIKE 'at://%/app.bsky.feed.post/%' + AND (r.json::jsonb)->'embed'->>'$type' = 'app.bsky.embed.recordWithMedia' + AND (r.json::jsonb)->'embed'->'media'->>'$type' = 'app.bsky.embed.images' + AND img.value->'image'->'ref'->>'$link' IS NOT NULL +ON CONFLICT DO NOTHING; + +-- Step 3: Backfill post_embed_video from app.bsky.embed.video +INSERT INTO post_embed_video ("postUri", "videoCid", alt) +SELECT + r.uri as "postUri", + (r.json::jsonb)->'embed'->'video'->'ref'->>'$link' as "videoCid", + (r.json::jsonb)->'embed'->>'alt' as alt +FROM record r +WHERE r.uri LIKE 'at://%/app.bsky.feed.post/%' + AND (r.json::jsonb)->'embed'->>'$type' = 
'app.bsky.embed.video' + AND (r.json::jsonb)->'embed'->'video'->'ref'->>'$link' IS NOT NULL +ON CONFLICT DO NOTHING; + +-- Step 4: Backfill post_embed_video from app.bsky.embed.recordWithMedia (video in media) +INSERT INTO post_embed_video ("postUri", "videoCid", alt) +SELECT + r.uri as "postUri", + (r.json::jsonb)->'embed'->'media'->'video'->'ref'->>'$link' as "videoCid", + (r.json::jsonb)->'embed'->'media'->>'alt' as alt +FROM record r +WHERE r.uri LIKE 'at://%/app.bsky.feed.post/%' + AND (r.json::jsonb)->'embed'->>'$type' = 'app.bsky.embed.recordWithMedia' + AND (r.json::jsonb)->'embed'->'media'->>'$type' = 'app.bsky.embed.video' + AND (r.json::jsonb)->'embed'->'media'->'video'->'ref'->>'$link' IS NOT NULL +ON CONFLICT DO NOTHING; + +-- Verification queries (run these to check progress) +-- SELECT COUNT(*) FROM post_embed_image; +-- SELECT COUNT(*) FROM post_embed_video; +-- SELECT COUNT(*) FROM record WHERE uri LIKE 'at://%/app.bsky.feed.post/%' AND (json::jsonb)->'embed'->>'$type' = 'app.bsky.embed.images'; diff --git a/rsky-wintermute/migrations/backfill_post_embeds_batched.sql b/rsky-wintermute/migrations/backfill_post_embeds_batched.sql new file mode 100644 index 00000000..7f75bc28 --- /dev/null +++ b/rsky-wintermute/migrations/backfill_post_embeds_batched.sql @@ -0,0 +1,131 @@ +-- Batched backfill for large tables (prevents long locks) +-- Run each batch separately, adjusting LIMIT/OFFSET as needed + +-- Check total counts first +SELECT + 'images' as type, + COUNT(*) as total +FROM record +WHERE uri LIKE 'at://%/app.bsky.feed.post/%' + AND ( + (json::jsonb)->'embed'->>'$type' = 'app.bsky.embed.images' + OR ( + (json::jsonb)->'embed'->>'$type' = 'app.bsky.embed.recordWithMedia' + AND (json::jsonb)->'embed'->'media'->>'$type' = 'app.bsky.embed.images' + ) + ) +UNION ALL +SELECT + 'videos' as type, + COUNT(*) as total +FROM record +WHERE uri LIKE 'at://%/app.bsky.feed.post/%' + AND ( + (json::jsonb)->'embed'->>'$type' = 'app.bsky.embed.video' + OR ( + 
(json::jsonb)->'embed'->>'$type' = 'app.bsky.embed.recordWithMedia' + AND (json::jsonb)->'embed'->'media'->>'$type' = 'app.bsky.embed.video' + ) + ); + +-- Batched image backfill (adjust LIMIT and run multiple times) +-- Batch 1: Direct image embeds +WITH batch AS ( + SELECT r.uri, r.json + FROM record r + LEFT JOIN post_embed_image pei ON pei."postUri" = r.uri + WHERE r.uri LIKE 'at://%/app.bsky.feed.post/%' + AND (r.json::jsonb)->'embed'->>'$type' = 'app.bsky.embed.images' + AND pei."postUri" IS NULL + LIMIT 10000 +) +INSERT INTO post_embed_image ("postUri", position, "imageCid", alt) +SELECT + b.uri as "postUri", + (img_idx.idx - 1)::text as position, + img.value->'image'->'ref'->>'$link' as "imageCid", + COALESCE(img.value->>'alt', '') as alt +FROM batch b, + jsonb_array_elements((b.json::jsonb)->'embed'->'images') WITH ORDINALITY AS img(value, idx), + LATERAL (SELECT img.idx as idx) img_idx +WHERE img.value->'image'->'ref'->>'$link' IS NOT NULL +ON CONFLICT DO NOTHING; + +-- Batch 2: Images in recordWithMedia +WITH batch AS ( + SELECT r.uri, r.json + FROM record r + LEFT JOIN post_embed_image pei ON pei."postUri" = r.uri + WHERE r.uri LIKE 'at://%/app.bsky.feed.post/%' + AND (r.json::jsonb)->'embed'->>'$type' = 'app.bsky.embed.recordWithMedia' + AND (r.json::jsonb)->'embed'->'media'->>'$type' = 'app.bsky.embed.images' + AND pei."postUri" IS NULL + LIMIT 10000 +) +INSERT INTO post_embed_image ("postUri", position, "imageCid", alt) +SELECT + b.uri as "postUri", + (img_idx.idx - 1)::text as position, + img.value->'image'->'ref'->>'$link' as "imageCid", + COALESCE(img.value->>'alt', '') as alt +FROM batch b, + jsonb_array_elements((b.json::jsonb)->'embed'->'media'->'images') WITH ORDINALITY AS img(value, idx), + LATERAL (SELECT img.idx as idx) img_idx +WHERE img.value->'image'->'ref'->>'$link' IS NOT NULL +ON CONFLICT DO NOTHING; + +-- Batched video backfill +-- Batch 1: Direct video embeds +WITH batch AS ( + SELECT r.uri, r.json + FROM record r + LEFT JOIN 
post_embed_video pev ON pev."postUri" = r.uri + WHERE r.uri LIKE 'at://%/app.bsky.feed.post/%' + AND (r.json::jsonb)->'embed'->>'$type' = 'app.bsky.embed.video' + AND pev."postUri" IS NULL + LIMIT 10000 +) +INSERT INTO post_embed_video ("postUri", "videoCid", alt) +SELECT + b.uri as "postUri", + (b.json::jsonb)->'embed'->'video'->'ref'->>'$link' as "videoCid", + (b.json::jsonb)->'embed'->>'alt' as alt +FROM batch b +WHERE (b.json::jsonb)->'embed'->'video'->'ref'->>'$link' IS NOT NULL +ON CONFLICT DO NOTHING; + +-- Batch 2: Videos in recordWithMedia +WITH batch AS ( + SELECT r.uri, r.json + FROM record r + LEFT JOIN post_embed_video pev ON pev."postUri" = r.uri + WHERE r.uri LIKE 'at://%/app.bsky.feed.post/%' + AND (r.json::jsonb)->'embed'->>'$type' = 'app.bsky.embed.recordWithMedia' + AND (r.json::jsonb)->'embed'->'media'->>'$type' = 'app.bsky.embed.video' + AND pev."postUri" IS NULL + LIMIT 10000 +) +INSERT INTO post_embed_video ("postUri", "videoCid", alt) +SELECT + b.uri as "postUri", + (b.json::jsonb)->'embed'->'media'->'video'->'ref'->>'$link' as "videoCid", + (b.json::jsonb)->'embed'->'media'->>'alt' as alt +FROM batch b +WHERE (b.json::jsonb)->'embed'->'media'->'video'->'ref'->>'$link' IS NOT NULL +ON CONFLICT DO NOTHING; + +-- Check remaining after each batch +SELECT + 'remaining_images' as metric, + COUNT(*) as count +FROM record r +LEFT JOIN post_embed_image pei ON pei."postUri" = r.uri +WHERE r.uri LIKE 'at://%/app.bsky.feed.post/%' + AND ( + (r.json::jsonb)->'embed'->>'$type' = 'app.bsky.embed.images' + OR ( + (r.json::jsonb)->'embed'->>'$type' = 'app.bsky.embed.recordWithMedia' + AND (r.json::jsonb)->'embed'->'media'->>'$type' = 'app.bsky.embed.images' + ) + ) + AND pei."postUri" IS NULL; diff --git a/rsky-wintermute/migrations/dedupe_notifications.sql b/rsky-wintermute/migrations/dedupe_notifications.sql new file mode 100644 index 00000000..2eed99c4 --- /dev/null +++ b/rsky-wintermute/migrations/dedupe_notifications.sql @@ -0,0 +1,38 @@ +-- 
Deduplicate notifications and add unique constraint
+-- WARNING: This migration operates on 1+ billion rows and will take many hours
+-- NOTE: rows written to bsky.notification after Step 1 takes its snapshot are not
+-- copied into the new table; pause writers (or re-sync) before the swap in Step 4
+
+-- Step 1: Create a staging table (a regular table, swapped in later) with distinct notifications
+-- Using ROW_NUMBER() to keep only the first occurrence (lowest id) of each duplicate
+CREATE TABLE bsky.notification_deduped AS
+SELECT id, did, "recordUri", "recordCid", author, reason, "reasonSubject", "sortAt"
+FROM (
+    SELECT *,
+           ROW_NUMBER() OVER (PARTITION BY did, "recordUri", reason ORDER BY id) as rn
+    FROM bsky.notification
+) sub
+WHERE rn = 1;
+
+-- Step 2: Create indexes on the new table (before swap to minimize downtime)
+CREATE INDEX notification_deduped_did_sortat_idx ON bsky.notification_deduped (did, "sortAt");
+ALTER TABLE bsky.notification_deduped ADD PRIMARY KEY (id);
+-- CREATE TABLE ... AS copies rows only (no defaults, constraints or indexes), so
+-- re-attach the id default; without it, inserts that omit id fail after the swap.
+-- Assumes id is backed by bsky.notification_id_seq, the sequence Step 4 advances.
+ALTER TABLE bsky.notification_deduped
+    ALTER COLUMN id SET DEFAULT nextval('bsky.notification_id_seq');
+
+-- Step 3: Add the unique constraint
+CREATE UNIQUE INDEX notification_deduped_unique_idx
+ON bsky.notification_deduped (did, "recordUri", reason);
+
+-- Step 4: Swap the tables
+-- IMPORTANT: Do this during a maintenance window
+BEGIN;
+ALTER TABLE bsky.notification RENAME TO notification_old;
+ALTER TABLE bsky.notification_deduped RENAME TO notification;
+-- Hand the sequence over to the new table so that dropping notification_old in
+-- Step 5 does not take the sequence (and the id default) with it
+ALTER SEQUENCE bsky.notification_id_seq OWNED BY bsky.notification.id;
+-- Update the sequence to continue from the max id
+SELECT setval('bsky.notification_id_seq', (SELECT MAX(id) FROM bsky.notification));
+COMMIT;
+
+-- Step 5: Drop the old table (after verifying everything works)
+-- DROP TABLE bsky.notification_old;
+
+-- Verification queries:
+-- SELECT COUNT(*) FROM bsky.notification;
+-- SELECT COUNT(*) FROM bsky.notification_old;
+-- SELECT (SELECT COUNT(*) FROM bsky.notification_old) - (SELECT COUNT(*) FROM bsky.notification) as duplicates_removed;
diff --git a/rsky-wintermute/src/bin/direct_index.rs b/rsky-wintermute/src/bin/direct_index.rs new file mode 100644 index 00000000..6b5f0e34 --- /dev/null +++ b/rsky-wintermute/src/bin/direct_index.rs @@ -0,0 +1,268 @@ +use std::io::Cursor; +use std::sync::Arc; + +use clap::Parser; +use color_eyre::Result; +use deadpool_postgres::{Config,
ManagerConfig, RecyclingMethod, Runtime}; +use iroh_car::CarReader; +use rsky_identity::IdResolver; +use rsky_identity::types::IdentityResolverOpts; +use rsky_repo::readable_repo::ReadableRepo; +use rsky_repo::storage::memory_blockstore::MemoryBlockstore; +use rsky_syntax::aturi::AtUri; +use tokio_postgres::NoTls; + +use rsky_repo::parse::get_and_parse_record; +use rsky_wintermute::backfiller::convert_record_to_ipld; +use rsky_wintermute::indexer::IndexerManager; +use rsky_wintermute::types::{IndexJob, WriteAction}; + +#[derive(Debug, Parser)] +#[command(name = "direct_index")] +#[command(about = "Directly fetch and index a repo, bypassing queues")] +struct Args { + /// DIDs to index (comma-separated or multiple --did flags) + #[arg(long = "did", num_args = 1..)] + dids: Vec, + + /// PostgreSQL connection URL + #[arg(long, env = "DATABASE_URL")] + database_url: String, +} + +#[tokio::main] +async fn main() -> Result<()> { + color_eyre::install()?; + tracing_subscriber::fmt::init(); + + let args = Args::parse(); + + // Parse all DIDs from args (supporting comma-separated) + let dids: Vec = args + .dids + .iter() + .flat_map(|d| d.split(',').map(|s| s.trim().to_string())) + .filter(|d| !d.is_empty() && d.starts_with("did:")) + .collect(); + + if dids.is_empty() { + eprintln!("No valid DIDs provided"); + return Ok(()); + } + + println!("Will index {} DIDs directly to PostgreSQL", dids.len()); + + // Setup database pool + let mut cfg = Config::new(); + cfg.url = Some(args.database_url.clone()); + cfg.manager = Some(ManagerConfig { + recycling_method: RecyclingMethod::Fast, + }); + + let pool = cfg.create_pool(Some(Runtime::Tokio1), NoTls)?; + let http_client = reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(120)) + .build()?; + + for did in &dids { + println!("\n=== Processing {} ===", did); + match process_did(&pool, &http_client, did).await { + Ok(count) => println!("Successfully indexed {} records for {}", count, did), + Err(e) => 
eprintln!("Failed to index {}: {}", did, e), + } + } + + println!("\nDone!"); + Ok(()) +} + +async fn process_did( + pool: &deadpool_postgres::Pool, + http_client: &reqwest::Client, + did: &str, +) -> Result { + // Resolve DID to get PDS endpoint + let resolver_opts = IdentityResolverOpts { + timeout: None, + plc_url: None, + did_cache: None, + backup_nameservers: None, + }; + let mut resolver = IdResolver::new(resolver_opts); + let doc = resolver + .did + .resolve(did.to_string(), None) + .await + .map_err(|e| color_eyre::eyre::eyre!("DID resolution error: {}", e))? + .ok_or_else(|| color_eyre::eyre::eyre!("DID resolution failed"))?; + + let mut pds_endpoint = None; + if let Some(services) = &doc.service { + for service in services { + if service.r#type == "AtprotoPersonalDataServer" || service.id == "#atproto_pds" { + pds_endpoint = Some(service.service_endpoint.clone()); + break; + } + } + } + + let pds_endpoint = + pds_endpoint.ok_or_else(|| color_eyre::eyre::eyre!("No PDS endpoint found"))?; + + println!(" PDS: {}", pds_endpoint); + + // Fetch CAR file + let repo_url = format!("{pds_endpoint}/xrpc/com.atproto.sync.getRepo?did={did}"); + println!(" Fetching CAR..."); + let response = http_client.get(&repo_url).send().await?; + + if !response.status().is_success() { + return Err(color_eyre::eyre::eyre!("HTTP error: {}", response.status())); + } + + let car_bytes = response.bytes().await?; + println!(" CAR size: {} bytes", car_bytes.len()); + + // Parse CAR file + let mut reader = CarReader::new(Cursor::new(car_bytes.to_vec())) + .await + .map_err(|e| color_eyre::eyre::eyre!("Failed to parse CAR file: {}", e))?; + let root = *reader + .header() + .roots() + .first() + .ok_or_else(|| color_eyre::eyre::eyre!("No root CID"))?; + + let mut blocks = rsky_repo::block_map::BlockMap::new(); + while let Some((cid, data)) = reader + .next_block() + .await + .map_err(|e| color_eyre::eyre::eyre!("Failed to read block: {}", e))? 
+ { + blocks.set(cid, data.clone()); + } + + let blockstore = MemoryBlockstore::new(Some(blocks)) + .await + .map_err(|e| color_eyre::eyre::eyre!("Failed to create blockstore: {}", e))?; + let storage_arc = Arc::new(tokio::sync::RwLock::new(blockstore)); + + let mut repo = ReadableRepo::load(storage_arc, root) + .await + .map_err(|e| color_eyre::eyre::eyre!("Failed to load repo: {}", e))?; + + if repo.did() != did { + return Err(color_eyre::eyre::eyre!( + "DID mismatch: expected {}, got {}", + did, + repo.did() + )); + } + + // Get all records + let leaves = repo + .data + .list(None, None, None) + .await + .map_err(|e| color_eyre::eyre::eyre!("Failed to list records: {}", e))?; + println!(" Found {} records", leaves.len()); + + let blocks_result = { + let storage_guard = repo.storage.read().await; + storage_guard + .get_blocks(leaves.iter().map(|e| e.value).collect()) + .await + .map_err(|e| color_eyre::eyre::eyre!("Failed to get blocks: {}", e))? + }; + + let rev = repo.commit.rev.clone(); + let now = chrono::Utc::now() + .format("%Y-%m-%dT%H:%M:%S%.3fZ") + .to_string(); + + let mut indexed_count = 0; + let mut skipped_count = 0; + + for entry in &leaves { + let uri_string = format!("at://{did}/{}", entry.key); + let Ok(uri) = AtUri::new(uri_string, None) else { + skipped_count += 1; + continue; + }; + + let collection = uri.get_collection(); + let rkey = uri.get_rkey(); + + // Filter to bsky/chat records + if !collection.starts_with("app.bsky.") && !collection.starts_with("chat.bsky.") { + skipped_count += 1; + continue; + } + + if let Ok(parsed) = get_and_parse_record(&blocks_result.blocks, entry.value) { + let record_json_raw = serde_json::to_value(&parsed.record)?; + let record_json = convert_record_to_ipld(&record_json_raw); + + let uri_string = format!("at://{did}/{collection}/{rkey}"); + let uri = AtUri::new(uri_string.clone(), None) + .map_err(|e| color_eyre::eyre::eyre!("Invalid URI {}: {}", uri_string, e))?; + let cid = entry.value.to_string(); + + let 
job = IndexJob { + uri: uri.to_string(), + cid, + action: WriteAction::Create, + record: Some(record_json), + indexed_at: now.clone(), + rev: rev.clone(), + }; + + // Index directly to PostgreSQL + if let Err(e) = IndexerManager::process_job(pool, &job).await { + eprintln!(" Warning: failed to index {}: {}", job.uri, e); + } else { + indexed_count += 1; + } + } else { + skipped_count += 1; + } + + if indexed_count > 0 && indexed_count % 100 == 0 { + print!("\r Indexed {} records...", indexed_count); + } + } + + println!("\r Indexed: {}, Skipped: {}", indexed_count, skipped_count); + + // Update profile_agg + println!(" Updating profile aggregates..."); + let client = pool.get().await?; + client + .execute( + "INSERT INTO profile_agg (did, \"postsCount\") + SELECT $1::varchar, COUNT(*) FROM post WHERE creator = $1::varchar + ON CONFLICT (did) DO UPDATE SET \"postsCount\" = EXCLUDED.\"postsCount\"", + &[&did], + ) + .await?; + + client + .execute( + "INSERT INTO profile_agg (did, \"followsCount\") + SELECT $1::varchar, COUNT(*) FROM follow WHERE creator = $1::varchar + ON CONFLICT (did) DO UPDATE SET \"followsCount\" = EXCLUDED.\"followsCount\"", + &[&did], + ) + .await?; + + client + .execute( + "INSERT INTO profile_agg (did, \"followersCount\") + SELECT $1::varchar, COUNT(*) FROM follow WHERE \"subjectDid\" = $1::varchar + ON CONFLICT (did) DO UPDATE SET \"followersCount\" = EXCLUDED.\"followersCount\"", + &[&did], + ) + .await?; + + Ok(indexed_count) +} diff --git a/rsky-wintermute/src/bin/queue_backfill.rs b/rsky-wintermute/src/bin/queue_backfill.rs index 2757993a..e87a050b 100644 --- a/rsky-wintermute/src/bin/queue_backfill.rs +++ b/rsky-wintermute/src/bin/queue_backfill.rs @@ -51,6 +51,9 @@ enum Command { /// Queue with normal priority instead of high priority #[arg(long, default_value = "false")] normal_priority: bool, + /// Queue with immediate priority (processed first, before all other items) + #[arg(long, default_value = "false")] + immediate: bool, }, /// 
Show current queue status Status, @@ -120,7 +123,8 @@ fn main() -> Result<()> { Command::Dids { dids, normal_priority, - } => queue_dids(&storage, &dids, !normal_priority), // DIDs use priority by default + immediate, + } => queue_dids(&storage, &dids, !normal_priority, immediate), Command::Status => show_status(&storage), Command::Peek { count } => peek_queue(&storage, count), Command::Search { did, limit } => search_queue(&storage, &did, limit), @@ -276,11 +280,17 @@ async fn queue_from_pds(storage: &Storage, host: &str, priority: bool) -> Result Ok(()) } -fn queue_dids(storage: &Storage, dids: &[String], priority: bool) -> Result<()> { +fn queue_dids(storage: &Storage, dids: &[String], priority: bool, immediate: bool) -> Result<()> { let mut queued = 0; let mut skipped = 0; - let priority_str = if priority { "HIGH PRIORITY" } else { "normal" }; + let priority_str = if immediate { + "IMMEDIATE" + } else if priority { + "HIGH PRIORITY" + } else { + "normal" + }; println!("Queuing DIDs with {priority_str} priority"); for did_arg in dids { @@ -301,10 +311,12 @@ fn queue_dids(storage: &Storage, dids: &[String], priority: bool) -> Result<()> let job = BackfillJob { did: did.to_string(), retry_count: 0, - priority, + priority: priority || immediate, }; - if priority { + if immediate { + storage.enqueue_backfill_immediate(&job)?; + } else if priority { storage.enqueue_backfill_priority(&job)?; } else { storage.enqueue_backfill(&job)?; diff --git a/rsky-wintermute/src/indexer/bulk.rs b/rsky-wintermute/src/indexer/bulk.rs index 193f8044..1b6f80e8 100644 --- a/rsky-wintermute/src/indexer/bulk.rs +++ b/rsky-wintermute/src/indexer/bulk.rs @@ -269,15 +269,30 @@ pub async fn copy_insert_posts( .await?; let insert_ms = insert_start.elapsed().as_millis(); + // Phase 4: Update profile_agg postsCount for affected creators + let agg_start = Instant::now(); + client + .execute( + "INSERT INTO profile_agg (did, \"postsCount\") + SELECT creator, COUNT(*) FROM post + WHERE creator IN 
(SELECT DISTINCT creator FROM _bulk_post) + GROUP BY creator + ON CONFLICT (did) DO UPDATE SET \"postsCount\" = EXCLUDED.\"postsCount\"", + &[], + ) + .await?; + let agg_ms = agg_start.elapsed().as_millis(); + // Log if total > 100ms (worth investigating) - let total_ms = setup_ms + copy_ms + insert_ms; + let total_ms = setup_ms + copy_ms + insert_ms + agg_ms; if total_ms > 100 { tracing::warn!( - "SLOW post bulk: {}ms total (setup={}ms, copy={}ms, insert={}ms) for {} rows", + "SLOW post bulk: {}ms total (setup={}ms, copy={}ms, insert={}ms, agg={}ms) for {} rows", total_ms, setup_ms, copy_ms, insert_ms, + agg_ms, count ); } @@ -519,15 +534,42 @@ pub async fn copy_insert_follows( .await?; let insert_ms = insert_start.elapsed().as_millis(); + // Phase 4: Update profile_agg followsCount and followersCount + let agg_start = Instant::now(); + // Update followsCount for creators (those who are following) + client + .execute( + "INSERT INTO profile_agg (did, \"followsCount\") + SELECT creator, COUNT(*) FROM follow + WHERE creator IN (SELECT DISTINCT creator FROM _bulk_follow) + GROUP BY creator + ON CONFLICT (did) DO UPDATE SET \"followsCount\" = EXCLUDED.\"followsCount\"", + &[], + ) + .await?; + // Update followersCount for subjects (those who are followed) + client + .execute( + "INSERT INTO profile_agg (did, \"followersCount\") + SELECT \"subjectDid\", COUNT(*) FROM follow + WHERE \"subjectDid\" IN (SELECT DISTINCT subject_did FROM _bulk_follow) + GROUP BY \"subjectDid\" + ON CONFLICT (did) DO UPDATE SET \"followersCount\" = EXCLUDED.\"followersCount\"", + &[], + ) + .await?; + let agg_ms = agg_start.elapsed().as_millis(); + // Log if total > 100ms (worth investigating) - let total_ms = setup_ms + copy_ms + insert_ms; + let total_ms = setup_ms + copy_ms + insert_ms + agg_ms; if total_ms > 100 { tracing::warn!( - "SLOW follow bulk: {}ms total (setup={}ms, copy={}ms, insert={}ms) for {} rows", + "SLOW follow bulk: {}ms total (setup={}ms, copy={}ms, insert={}ms, 
agg={}ms) for {} rows", total_ms, setup_ms, copy_ms, insert_ms, + agg_ms, count ); } @@ -702,6 +744,184 @@ pub async fn copy_insert_blocks( Ok(()) }
+/// Sanitizes free-form alt text for one field of a `COPY ... (FORMAT text)` row.
+///
+/// Tabs and line breaks become single spaces so they cannot split the field or the
+/// row. Backslashes are doubled because the text format treats `\` as an escape
+/// character: `\N` would be read as NULL, `\t`/`\n` as control characters, and the
+/// backslash of any other pair would be silently dropped.
+fn sanitize_copy_alt(alt: &str) -> String {
+    let mut out = String::with_capacity(alt.len());
+    for c in alt.chars() {
+        match c {
+            '\t' | '\n' | '\r' => out.push(' '),
+            '\\' => out.push_str("\\\\"),
+            _ => out.push(c),
+        }
+    }
+    out
+}
+
+/// Bulk insert `post_embed_image` records using `COPY` protocol.
+///
+/// Rows are staged in the `_bulk_post_embed_image` temp table (truncated first) and
+/// then merged into `post_embed_image` with `ON CONFLICT DO NOTHING`. Returns
+/// immediately when `data` is empty. A warning is logged when the three phases
+/// together take more than 100 ms.
+///
+/// Each tuple is `(post_uri, position, image_cid, alt)`. Only `alt` is sanitized; the
+/// other fields are written as given.
+///
+/// # Errors
+///
+/// Returns an error if any statement, the `COPY` stream, or the row buffer write fails.
+pub async fn copy_insert_post_embed_images(
+    client: &deadpool_postgres::Client,
+    data: &[(String, String, String, String)], // post_uri, position, image_cid, alt
+) -> Result<(), WintermuteError> {
+    use std::time::Instant;
+
+    if data.is_empty() {
+        return Ok(());
+    }
+
+    let count = data.len();
+
+    // Phase 1: Table setup
+    let setup_start = Instant::now();
+    client
+        .execute(
+            "CREATE TEMP TABLE IF NOT EXISTS _bulk_post_embed_image (
+                post_uri text NOT NULL,
+                position text NOT NULL,
+                image_cid text NOT NULL,
+                alt text NOT NULL
+            )",
+            &[],
+        )
+        .await?;
+
+    client
+        .execute("TRUNCATE _bulk_post_embed_image", &[])
+        .await?;
+    let setup_ms = setup_start.elapsed().as_millis();
+
+    // Phase 2: COPY data
+    let copy_start = Instant::now();
+    let copy_stmt = client
+        .copy_in("COPY _bulk_post_embed_image (post_uri, position, image_cid, alt) FROM STDIN WITH (FORMAT text, DELIMITER E'\\t')")
+        .await?;
+
+    let sink = copy_stmt;
+    pin_mut!(sink);
+
+    let mut buffer = Vec::with_capacity(data.len() * 150);
+    for (post_uri, position, image_cid, alt) in data {
+        // Alt text is user-authored: neutralize delimiters and backslash escapes
+        let escaped_alt = sanitize_copy_alt(alt);
+        writeln!(buffer, "{post_uri}\t{position}\t{image_cid}\t{escaped_alt}")
+            .map_err(|e| WintermuteError::Other(format!("buffer write error: {e}")))?;
+    }
+
+    sink.send(bytes::Bytes::from(buffer)).await?;
+    sink.close().await?;
+    let copy_ms = copy_start.elapsed().as_millis();
+
+    // Phase 3: INSERT...ON CONFLICT
+    let insert_start = Instant::now();
+    client
+        .execute(
+            "INSERT INTO post_embed_image (\"postUri\", position, \"imageCid\", alt)
+             SELECT post_uri, position, image_cid, alt
+             FROM _bulk_post_embed_image
+             ON CONFLICT DO NOTHING",
+            &[],
+        )
+        .await?;
+    let insert_ms = insert_start.elapsed().as_millis();
+
+    // Log if total > 100ms (worth investigating)
+    let total_ms = setup_ms + copy_ms + insert_ms;
+    if total_ms > 100 {
+        tracing::warn!(
+            "SLOW post_embed_image bulk: {}ms total (setup={}ms, copy={}ms, insert={}ms) for {} rows",
+            total_ms,
+            setup_ms,
+            copy_ms,
+            insert_ms,
+            count
+        );
+    }
+
+    Ok(())
+}
+
+/// Bulk insert `post_embed_video` records using `COPY` protocol.
+///
+/// Rows are staged in the `_bulk_post_embed_video` temp table (truncated first) and
+/// then merged into `post_embed_video` with `ON CONFLICT DO NOTHING`. Returns
+/// immediately when `data` is empty. A warning is logged when the three phases
+/// together take more than 100 ms.
+///
+/// Each tuple is `(post_uri, video_cid, alt)`. A `None` alt is stored as NULL; a
+/// present alt is sanitized, the other fields are written as given.
+///
+/// # Errors
+///
+/// Returns an error if any statement, the `COPY` stream, or the row buffer write fails.
+pub async fn copy_insert_post_embed_videos(
+    client: &deadpool_postgres::Client,
+    data: &[(String, String, Option<String>)], // post_uri, video_cid, alt
+) -> Result<(), WintermuteError> {
+    use std::time::Instant;
+
+    if data.is_empty() {
+        return Ok(());
+    }
+
+    let count = data.len();
+
+    // Phase 1: Table setup
+    let setup_start = Instant::now();
+    client
+        .execute(
+            "CREATE TEMP TABLE IF NOT EXISTS _bulk_post_embed_video (
+                post_uri text NOT NULL,
+                video_cid text NOT NULL,
+                alt text
+            )",
+            &[],
+        )
+        .await?;
+
+    client
+        .execute("TRUNCATE _bulk_post_embed_video", &[])
+        .await?;
+    let setup_ms = setup_start.elapsed().as_millis();
+
+    // Phase 2: COPY data (with NULL handling for alt)
+    let copy_start = Instant::now();
+    let copy_stmt = client
+        .copy_in("COPY _bulk_post_embed_video (post_uri, video_cid, alt) FROM STDIN WITH (FORMAT text, DELIMITER E'\\t', NULL '\\N')")
+        .await?;
+
+    let sink = copy_stmt;
+    pin_mut!(sink);
+
+    let mut buffer = Vec::with_capacity(data.len() * 150);
+    for (post_uri, video_cid, alt) in data {
+        // `None` is written as the COPY NULL marker. Present text is sanitized, so an
+        // alt that literally reads `\N` is stored as text instead of becoming NULL.
+        let escaped_alt = alt
+            .as_deref()
+            .map_or_else(|| "\\N".to_owned(), sanitize_copy_alt);
+        writeln!(buffer, "{post_uri}\t{video_cid}\t{escaped_alt}")
+            .map_err(|e| WintermuteError::Other(format!("buffer write error: {e}")))?;
+    }
+
+    sink.send(bytes::Bytes::from(buffer)).await?;
+    sink.close().await?;
+    let copy_ms = copy_start.elapsed().as_millis();
+
+    // Phase 3: INSERT...ON CONFLICT
+    let insert_start = Instant::now();
+    client
+        .execute(
+            "INSERT INTO post_embed_video (\"postUri\", \"videoCid\", alt)
+             SELECT post_uri, video_cid, alt
+             FROM _bulk_post_embed_video
+             ON CONFLICT DO NOTHING",
+            &[],
+        )
+        .await?;
+    let insert_ms = insert_start.elapsed().as_millis();
+
+    // Log if total > 100ms (worth investigating)
+    let total_ms = setup_ms + copy_ms + insert_ms;
+    if total_ms > 100 {
+        tracing::warn!(
+            "SLOW post_embed_video bulk: {}ms total (setup={}ms, copy={}ms, insert={}ms) for {} rows",
+            total_ms,
+            setup_ms,
+            copy_ms,
+            insert_ms,
+            count
+        );
+    }
+
+    Ok(())
+}
+
#[cfg(test)] mod tests { #[test] diff --git a/rsky-wintermute/src/indexer/mod.rs b/rsky-wintermute/src/indexer/mod.rs index 8c384584..e206e7bd 100644 --- a/rsky-wintermute/src/indexer/mod.rs +++ b/rsky-wintermute/src/indexer/mod.rs @@ -1315,7 +1315,7 @@ impl IndexerManager { ) .await?; } - "app.bsky.verification.proof" => { + "app.bsky.graph.verification" => { Self::index_verification( &client, did.as_str(), @@ -1326,6 +1326,16 @@ impl IndexerManager { ) .await?; } + "community.blacksky.feed.post" => { + Self::index_community_post_stub( + &client, + did.as_str(), + rkey.as_str(), + &job.cid, + &job.indexed_at, + ) + .await?; + } _ => {} } } @@ -1389,9 +1399,12 @@ impl IndexerManager { "app.bsky.actor.status" => { Self::delete_status(&client, did.as_str(), rkey.as_str()).await?; } - "app.bsky.verification.proof" => { + "app.bsky.graph.verification" => { Self::delete_verification(&client, did.as_str(), rkey.as_str()).await?; } + "community.blacksky.feed.post" => { + Self::delete_community_post(&client, did.as_str(), rkey.as_str()).await?; + } _ => {} } } @@ -1842,9 +1855,12 @@ impl IndexerManager { "app.bsky.actor.status" => { Self::index_status(client, did, rkey, record, cid, indexed_at).await } - "app.bsky.verification.proof" => { + "app.bsky.graph.verification" => { Self::index_verification(client, did, rkey, record, cid, indexed_at).await } +
"community.blacksky.feed.post" => { + Self::index_community_post_stub(client, did, rkey, cid, indexed_at).await + } _ => Ok(()), } } @@ -2194,6 +2210,8 @@ impl IndexerManager { let mut creators: Vec = Vec::with_capacity(jobs.len()); let mut display_names: Vec> = Vec::with_capacity(jobs.len()); let mut descriptions: Vec> = Vec::with_capacity(jobs.len()); + let mut avatar_cids: Vec> = Vec::with_capacity(jobs.len()); + let mut banner_cids: Vec> = Vec::with_capacity(jobs.len()); let mut indexed_ats: Vec = Vec::with_capacity(jobs.len()); for pj in jobs { @@ -2201,12 +2219,26 @@ impl IndexerManager { let uri = pj.uri.to_string(); let display_name = sanitize_opt(record.get("displayName").and_then(|v| v.as_str())); let description = sanitize_opt(record.get("description").and_then(|v| v.as_str())); + let avatar_cid = record + .get("avatar") + .and_then(|v| v.get("ref")) + .and_then(|v| v.get("$link")) + .and_then(|v| v.as_str()) + .map(String::from); + let banner_cid = record + .get("banner") + .and_then(|v| v.get("ref")) + .and_then(|v| v.get("$link")) + .and_then(|v| v.as_str()) + .map(String::from); uris.push(uri); cids.push(pj.job.cid.clone()); creators.push(pj.did.clone()); display_names.push(display_name); descriptions.push(description); + avatar_cids.push(avatar_cid); + banner_cids.push(banner_cid); indexed_ats.push(pj.job.indexed_at.clone()); metrics::INDEXER_PROFILE_EVENTS_TOTAL.inc(); @@ -2215,10 +2247,16 @@ impl IndexerManager { client .execute( - "INSERT INTO profile (uri, cid, creator, \"displayName\", description, \"indexedAt\") - SELECT * FROM unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::text[], $6::text[]) - ON CONFLICT DO NOTHING", - &[&uris, &cids, &creators, &display_names, &descriptions, &indexed_ats], + "INSERT INTO profile (uri, cid, creator, \"displayName\", description, \"avatarCid\", \"bannerCid\", \"indexedAt\") + SELECT * FROM unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::text[], $6::text[], $7::text[], $8::text[]) + ON 
CONFLICT (uri) DO UPDATE SET + cid = EXCLUDED.cid, + \"displayName\" = EXCLUDED.\"displayName\", + description = EXCLUDED.description, + \"avatarCid\" = EXCLUDED.\"avatarCid\", + \"bannerCid\" = EXCLUDED.\"bannerCid\", + \"indexedAt\" = EXCLUDED.\"indexedAt\"", + &[&uris, &cids, &creators, &display_names, &descriptions, &avatar_cids, &banner_cids, &indexed_ats], ) .await?; @@ -2352,6 +2390,8 @@ impl IndexerManager { Vec::with_capacity(jobs.len()); let mut feed_item_data: Vec<(String, String, String, String, String, String)> = Vec::with_capacity(jobs.len()); + let mut embed_image_data: Vec<(String, String, String, String)> = Vec::new(); + let mut embed_video_data: Vec<(String, String, Option)> = Vec::new(); for pj in jobs { if let Some(record) = &pj.job.record { @@ -2381,21 +2421,115 @@ impl IndexerManager { "post".to_owned(), uri.clone(), pj.job.cid.clone(), - uri, + uri.clone(), pj.did.clone(), sort_at, )); + // Extract embed data for images and videos + if let Some(embed) = record.get("embed") { + Self::extract_embed_data( + embed, + &uri, + &mut embed_image_data, + &mut embed_video_data, + ); + } + metrics::INDEXER_POST_EVENTS_TOTAL.inc(); } } bulk::copy_insert_posts(client, &post_data).await?; bulk::copy_insert_feed_items(client, &feed_item_data).await?; + bulk::copy_insert_post_embed_images(client, &embed_image_data).await?; + bulk::copy_insert_post_embed_videos(client, &embed_video_data).await?; Ok(()) } + /// Extract embed data (images and videos) from a post's embed field + fn extract_embed_data( + embed: &serde_json::Value, + post_uri: &str, + embed_image_data: &mut Vec<(String, String, String, String)>, + embed_video_data: &mut Vec<(String, String, Option)>, + ) { + let embed_type = embed.get("$type").and_then(|t| t.as_str()).unwrap_or(""); + + // Handle app.bsky.embed.images + if embed_type == "app.bsky.embed.images" { + Self::extract_images(embed, post_uri, embed_image_data); + } + + // Handle app.bsky.embed.video + if embed_type == 
"app.bsky.embed.video" { + Self::extract_video(embed, post_uri, embed_video_data); + } + + // Handle app.bsky.embed.recordWithMedia (has nested media) + if embed_type == "app.bsky.embed.recordWithMedia" { + if let Some(media) = embed.get("media") { + let media_type = media.get("$type").and_then(|t| t.as_str()).unwrap_or(""); + if media_type == "app.bsky.embed.images" { + Self::extract_images(media, post_uri, embed_image_data); + } else if media_type == "app.bsky.embed.video" { + Self::extract_video(media, post_uri, embed_video_data); + } + } + } + } + + fn extract_images( + embed: &serde_json::Value, + post_uri: &str, + embed_image_data: &mut Vec<(String, String, String, String)>, + ) { + if let Some(images) = embed.get("images").and_then(|i| i.as_array()) { + for (position, image) in images.iter().enumerate() { + // Get the image CID - can be in image.ref.$link (CBOR decoded) or image.ref (string) + let image_cid = image.get("image").and_then(|img| { + img.get("ref") + .and_then(|r| r.get("$link").and_then(|l| l.as_str())) + .or_else(|| img.get("ref").and_then(|r| r.as_str())) + }); + + let alt = image.get("alt").and_then(|a| a.as_str()).unwrap_or(""); + + if let Some(image_cid) = image_cid { + embed_image_data.push(( + post_uri.to_owned(), + position.to_string(), + image_cid.to_owned(), + alt.to_owned(), + )); + } + } + } + } + + fn extract_video( + embed: &serde_json::Value, + post_uri: &str, + embed_video_data: &mut Vec<(String, String, Option)>, + ) { + // Get the video CID - can be in video.ref.$link (CBOR decoded) or video.ref (string) + let video_cid = embed.get("video").and_then(|vid| { + vid.get("ref") + .and_then(|r| r.get("$link").and_then(|l| l.as_str())) + .or_else(|| vid.get("ref").and_then(|r| r.as_str())) + }); + + let alt = embed + .get("alt") + .and_then(|a| a.as_str()) + .map(ToOwned::to_owned); + + if let Some(video_cid) = video_cid { + embed_video_data.push((post_uri.to_owned(), video_cid.to_owned(), alt)); + } + } + async fn 
copy_batch_insert_likes( client: &deadpool_postgres::Client, jobs: &[&ParsedJob<'_>], @@ -2598,32 +2732,50 @@ impl IndexerManager { // Process each label in the event for label in &label_event.labels { - // Insert or update the label - // Note: Using empty string for cid since label messages don't include it - // The primary key is (src, uri, cid, val), so we use "" as cid + let cid = label.cid.as_deref().unwrap_or(""); + let exp: Option<&str> = label.exp.as_deref(); + let result = client .execute( - "INSERT INTO label (src, uri, cid, val, cts, neg) - VALUES ($1, $2, $3, $4, $5, false) + "INSERT INTO label (src, uri, cid, val, neg, cts, exp) + VALUES ($1, $2, $3, $4, $5, $6, $7) ON CONFLICT (src, uri, cid, val) DO UPDATE SET - cts = EXCLUDED.cts", - &[&label.src, &label.uri, &"", &label.val, &label.cts], + neg = EXCLUDED.neg, + cts = EXCLUDED.cts, + exp = EXCLUDED.exp", + &[ + &label.src, &label.uri, &cid, &label.val, &label.neg, &label.cts, &exp, + ], ) .await; match result { Ok(_) => { - tracing::debug!( - "indexed label: src={} uri={} val={}", + if label.neg { + tracing::info!( + "negated label: src={} uri={} val={}", + label.src, + label.uri, + label.val + ); + } else { + tracing::debug!( + "indexed label: src={} uri={} val={}", + label.src, + label.uri, + label.val + ); + } + } + Err(e) => { + tracing::error!( + "failed to insert label: src={} uri={} val={} neg={}: {e}", label.src, label.uri, - label.val + label.val, + label.neg ); - } - Err(e) => { - tracing::error!("failed to insert label: {e}"); metrics::INDEXER_RECORDS_FAILED_TOTAL.inc(); - // Continue processing other labels even if one fails } } } @@ -2701,19 +2853,169 @@ impl IndexerManager { ) .await?; - // Generate reply notification for parent post author + // Generate mention notifications from facets + if let Some(facets) = record.get("facets").and_then(|f| f.as_array()) { + for facet in facets { + if let Some(features) = facet.get("features").and_then(|f| f.as_array()) { + for feature in features 
{ + let feature_type = + feature.get("$type").and_then(|t| t.as_str()).unwrap_or(""); + if feature_type == "app.bsky.richtext.facet#mention" { + if let Some(mention_did) = feature.get("did").and_then(|d| d.as_str()) { + if mention_did == did { + tracing::debug!("skipping self-mention for {}", did); + } else { + tracing::info!( + "inserting mention notification: recipient={}, author={}, uri={}", + mention_did, + did, + uri + ); + match client + .execute( + "INSERT INTO notification (did, author, \"recordUri\", \"recordCid\", reason, \"sortAt\") + VALUES ($1, $2, $3, $4, $5, $6) + ON CONFLICT (did, \"recordUri\", reason) DO NOTHING", + &[ + &mention_did, &did, &uri, &cid, &"mention", + &sort_at, + ], + ) + .await + { + Ok(rows) => tracing::info!( + "mention notification result: rows_affected={}, recipient={}, uri={}", + rows, + mention_did, + uri + ), + Err(e) => tracing::warn!("failed to insert mention notification for {uri}: {e}"), + } + } + } + } + } + } + } + } else { + tracing::debug!("no facets found for post {}", uri); + } + + // Reply notifications: walk ancestor chain up to REPLY_NOTIF_DEPTH + // Matches official Bluesky behavior from post.ts notifsForInsert if let Some(parent_uri_str) = reply_parent { - if let Ok(parent_uri) = AtUri::new(parent_uri_str.to_owned(), None) { - let parent_author = parent_uri.get_hostname(); - if parent_author != did { - client - .execute( - "INSERT INTO notification (did, author, \"recordUri\", \"recordCid\", reason, \"reasonSubject\", \"sortAt\") - VALUES ($1, $2, $3, $4, $5, $6, $7) - ON CONFLICT DO NOTHING", - &[&parent_author, &did, &uri, &cid, &"reply", &Some(parent_uri_str), &sort_at], - ) - .await?; + const REPLY_NOTIF_DEPTH: i32 = 5; + + // Query ancestors using recursive CTE + // Height 0 = self, 1 = parent, 2 = grandparent, etc. 
+ let ancestors = client + .query( + "WITH RECURSIVE ancestor(uri, ancestor_uri, height) AS ( + SELECT p.uri, p.\"replyParent\", 0 + FROM post p + WHERE p.uri = $1 + UNION ALL + SELECT p.uri, p.\"replyParent\", a.height + 1 + FROM post p + INNER JOIN ancestor a ON a.ancestor_uri = p.uri + WHERE a.height < $2 + ) + SELECT uri, height FROM ancestor", + &[&uri, &REPLY_NOTIF_DEPTH], + ) + .await?; + + // Notify each ancestor author (skip self at height 0) + for row in &ancestors { + let height: i32 = row.get(1); + if height == 0 || height >= REPLY_NOTIF_DEPTH { + continue; + } + let ancestor_uri_str: String = row.get(0); + if let Ok(ancestor_uri) = AtUri::new(ancestor_uri_str.clone(), None) { + let ancestor_author = ancestor_uri.get_hostname(); + if ancestor_author != did { + if let Err(e) = client + .execute( + "INSERT INTO notification (did, author, \"recordUri\", \"recordCid\", reason, \"reasonSubject\", \"sortAt\") + VALUES ($1, $2, $3, $4, $5, $6, $7) + ON CONFLICT (did, \"recordUri\", reason) DO NOTHING", + &[ + &ancestor_author, + &did, + &uri, + &cid, + &"reply", + &ancestor_uri_str, + &sort_at, + ], + ) + .await + { + tracing::warn!("failed to insert reply notification for {uri}: {e}"); + } + } + } + } + + // Descendant notifications for out-of-order indexing + // When a post in the middle of a thread is indexed after its replies, + // we notify ancestors about existing descendant replies + let descendants = client + .query( + "WITH RECURSIVE descendent(uri, depth) AS ( + SELECT p.uri, 1 + FROM post p + WHERE p.\"replyParent\" = $1 AND 1 <= $2 + UNION ALL + SELECT p.uri, d.depth + 1 + FROM post p + INNER JOIN descendent d ON d.uri = p.\"replyParent\" + WHERE d.depth < $2 + ) + SELECT d.uri, d.depth, p.cid, p.creator, p.\"sortAt\" + FROM descendent d + INNER JOIN post p ON p.uri = d.uri", + &[&uri, &REPLY_NOTIF_DEPTH], + ) + .await?; + + for desc_row in &descendants { + let desc_uri: String = desc_row.get(0); + let desc_depth: i32 = desc_row.get(1); + let 
desc_cid: String = desc_row.get(2); + let desc_creator: String = desc_row.get(3); + let desc_sort_at: String = desc_row.get(4); + + for anc_row in &ancestors { + let anc_height: i32 = anc_row.get(1); + if desc_depth + anc_height < REPLY_NOTIF_DEPTH { + let anc_uri: String = anc_row.get(0); + if let Ok(anc_uri_parsed) = AtUri::new(anc_uri.clone(), None) { + let anc_author = anc_uri_parsed.get_hostname(); + if anc_author != &desc_creator { + if let Err(e) = client + .execute( + "INSERT INTO notification (did, author, \"recordUri\", \"recordCid\", reason, \"reasonSubject\", \"sortAt\") + VALUES ($1, $2, $3, $4, $5, $6, $7) + ON CONFLICT (did, \"recordUri\", reason) DO NOTHING", + &[ + &anc_author, + &desc_creator, + &desc_uri, + &desc_cid, + &"reply", + &anc_uri, + &desc_sort_at, + ], + ) + .await + { + tracing::warn!("failed to insert reply notification for {desc_uri}: {e}"); + } + } + } + } } } @@ -2747,25 +3049,111 @@ impl IndexerManager { created_at: &str, indexed_at: &str, ) -> Result<(), WintermuteError> { + let embed_type = embed.get("$type").and_then(|t| t.as_str()).unwrap_or(""); + + // Handle app.bsky.embed.images + if embed_type == "app.bsky.embed.images" { + Self::handle_embed_images(client, embed, post_uri).await?; + } + + // Handle app.bsky.embed.video + if embed_type == "app.bsky.embed.video" { + Self::handle_embed_video(client, embed, post_uri).await?; + } + // Handle app.bsky.embed.record (quote post) - if let Some(record) = embed.get("record") { - Self::handle_embed_record( - client, record, post_uri, post_cid, creator, created_at, indexed_at, - ) - .await?; + if embed_type == "app.bsky.embed.record" { + if let Some(record) = embed.get("record") { + Self::handle_embed_record( + client, record, post_uri, post_cid, creator, created_at, indexed_at, + ) + .await?; + } } // Handle app.bsky.embed.recordWithMedia (quote post with media) - if let Some(record) = embed.get("record").and_then(|r| r.get("record")) { - Self::handle_embed_record( - client, record, 
post_uri, post_cid, creator, created_at, indexed_at, - ) - .await?; + if embed_type == "app.bsky.embed.recordWithMedia" { + // Handle the record part (quote) + if let Some(record) = embed.get("record").and_then(|r| r.get("record")) { + Self::handle_embed_record( + client, record, post_uri, post_cid, creator, created_at, indexed_at, + ) + .await?; + } + + // Handle the media part (images or video) + if let Some(media) = embed.get("media") { + let media_type = media.get("$type").and_then(|t| t.as_str()).unwrap_or(""); + if media_type == "app.bsky.embed.images" { + Self::handle_embed_images(client, media, post_uri).await?; + } else if media_type == "app.bsky.embed.video" { + Self::handle_embed_video(client, media, post_uri).await?; + } + } } Ok(()) } + async fn handle_embed_images( + client: &deadpool_postgres::Client, + embed: &serde_json::Value, + post_uri: &str, + ) -> Result<(), WintermuteError> { + if let Some(images) = embed.get("images").and_then(|i| i.as_array()) { + for (position, image) in images.iter().enumerate() { + // Get the image CID - can be in image.ref.$link (CBOR decoded) or image.ref (string) + let image_cid = image.get("image").and_then(|img| { + img.get("ref") + .and_then(|r| r.get("$link").and_then(|l| l.as_str())) + .or_else(|| img.get("ref").and_then(|r| r.as_str())) + }); + + let alt = image.get("alt").and_then(|a| a.as_str()).unwrap_or(""); + + if let Some(image_cid) = image_cid { + let position_str = position.to_string(); + client + .execute( + "INSERT INTO post_embed_image (\"postUri\", position, \"imageCid\", alt) + VALUES ($1, $2, $3, $4) + ON CONFLICT DO NOTHING", + &[&post_uri, &position_str, &image_cid, &alt], + ) + .await?; + } + } + } + Ok(()) + } + + async fn handle_embed_video( + client: &deadpool_postgres::Client, + embed: &serde_json::Value, + post_uri: &str, + ) -> Result<(), WintermuteError> { + // Get the video CID - can be in video.ref.$link (CBOR decoded) or video.ref (string) + let video_cid = 
embed.get("video").and_then(|vid| { + vid.get("ref") + .and_then(|r| r.get("$link").and_then(|l| l.as_str())) + .or_else(|| vid.get("ref").and_then(|r| r.as_str())) + }); + + let alt = embed.get("alt").and_then(|a| a.as_str()); + + if let Some(video_cid) = video_cid { + client + .execute( + "INSERT INTO post_embed_video (\"postUri\", \"videoCid\", alt) + VALUES ($1, $2, $3) + ON CONFLICT DO NOTHING", + &[&post_uri, &video_cid, &alt], + ) + .await?; + } + Ok(()) + } + async fn handle_embed_record( client: &deadpool_postgres::Client, record: &serde_json::Value, @@ -2781,13 +3169,19 @@ impl IndexerManager { if let (Some(embed_uri), Some(embed_cid)) = (embed_uri, embed_cid) { // Only process if it's a post being quoted if embed_uri.contains("/app.bsky.feed.post/") { + // Calculate sortAt (earlier of indexed_at and created_at) + let sort_at_quote = if indexed_at < created_at { + indexed_at + } else { + created_at + }; // Insert into quote table client .execute( - "INSERT INTO quote (uri, cid, creator, subject, \"subjectCid\", \"createdAt\", \"indexedAt\") - VALUES ($1, $2, $3, $4, $5, $6, $7) + "INSERT INTO quote (uri, cid, creator, subject, \"subjectCid\", \"createdAt\", \"indexedAt\", \"sortAt\") + VALUES ($1, $2, $3, $4, $5, $6, $7, $8) ON CONFLICT DO NOTHING", - &[&post_uri, &post_cid, &creator, &embed_uri, &embed_cid, &created_at, &indexed_at], + &[&post_uri, &post_cid, &creator, &embed_uri, &embed_cid, &created_at, &indexed_at, &sort_at_quote], ) .await?; @@ -2810,14 +3204,17 @@ impl IndexerManager { } else { created_at }; - client + if let Err(e) = client .execute( "INSERT INTO notification (did, author, \"recordUri\", \"recordCid\", reason, \"reasonSubject\", \"sortAt\") VALUES ($1, $2, $3, $4, $5, $6, $7) - ON CONFLICT DO NOTHING", + ON CONFLICT (did, \"recordUri\", reason) DO NOTHING", &["ed_author, &creator, &post_uri, &post_cid, &"quote", &Some(embed_uri), &sort_at], ) - .await?; + .await + { + tracing::warn!("failed to insert quote notification for 
{post_uri}: {e}"); + } } } } @@ -2843,6 +3240,50 @@ impl IndexerManager { Ok(()) } + /// Index a community post stub arriving from the firehose. + /// The full content was already stored via community.blacksky.feed.submitPost XRPC. + /// This just updates the CID on the existing `community_post` row. + async fn index_community_post_stub( + client: &deadpool_postgres::Client, + did: &str, + rkey: &str, + cid: &str, + indexed_at: &str, + ) -> Result<(), WintermuteError> { + let uri = format!("at://{did}/community.blacksky.feed.post/{rkey}"); + + let rows = client + .execute( + "UPDATE community_post SET cid = $1, \"indexedAt\" = $2 WHERE uri = $3", + &[&cid, &indexed_at, &uri], + ) + .await?; + + if rows == 0 { + tracing::debug!( + "community post stub for {} not found in community_post table (content not yet submitted)", + uri + ); + } else { + tracing::info!("updated community post stub cid for {}", uri); + } + + Ok(()) + } + + async fn delete_community_post( + client: &deadpool_postgres::Client, + did: &str, + rkey: &str, + ) -> Result<(), WintermuteError> { + let uri = format!("at://{did}/community.blacksky.feed.post/{rkey}"); + client + .execute("DELETE FROM community_post WHERE uri = $1", &[&uri]) + .await?; + tracing::info!("deleted community post {}", uri); + Ok(()) + } + async fn index_like( client: &deadpool_postgres::Client, did: &str, @@ -2882,13 +3323,17 @@ impl IndexerManager { if let Ok(subject_uri) = AtUri::new(subject.to_owned(), None) { let subject_author = subject_uri.get_hostname(); if subject_author != did { - client + if let Err(e) = client .execute( "INSERT INTO notification (did, author, \"recordUri\", \"recordCid\", reason, \"reasonSubject\", \"sortAt\") - VALUES ($1, $2, $3, $4, $5, $6, $7)", + VALUES ($1, $2, $3, $4, $5, $6, $7) + ON CONFLICT (did, \"recordUri\", reason) DO NOTHING", &[&subject_author, &did, &uri, &cid, &"like", &Some(subject), &indexed_at], ) - .await?; + .await + { + tracing::warn!("failed to insert like notification for 
{uri}: {e}"); + } } } } @@ -2953,13 +3398,17 @@ impl IndexerManager { .await?; if row_count > 0 { - client + if let Err(e) = client .execute( "INSERT INTO notification (did, author, \"recordUri\", \"recordCid\", reason, \"reasonSubject\", \"sortAt\") - VALUES ($1, $2, $3, $4, $5, $6, $7)", + VALUES ($1, $2, $3, $4, $5, $6, $7) + ON CONFLICT (did, \"recordUri\", reason) DO NOTHING", &[&subject, &did, &uri, &cid, &"follow", &None::, &indexed_at], ) - .await?; + .await + { + tracing::warn!("failed to insert follow notification for {uri}: {e}"); + } } client @@ -3053,13 +3502,17 @@ impl IndexerManager { if let Ok(subject_uri) = AtUri::new(subject.to_owned(), None) { let subject_author = subject_uri.get_hostname(); if subject_author != did { - client + if let Err(e) = client .execute( "INSERT INTO notification (did, author, \"recordUri\", \"recordCid\", reason, \"reasonSubject\", \"sortAt\") - VALUES ($1, $2, $3, $4, $5, $6, $7)", + VALUES ($1, $2, $3, $4, $5, $6, $7) + ON CONFLICT (did, \"recordUri\", reason) DO NOTHING", &[&subject_author, &did, &uri, &cid, &"repost", &Some(subject), &indexed_at], ) - .await?; + .await + { + tracing::warn!("failed to insert repost notification for {uri}: {e}"); + } } } } @@ -3121,7 +3574,7 @@ impl IndexerManager { .execute( "INSERT INTO actor_block (uri, cid, creator, \"subjectDid\", \"createdAt\", \"indexedAt\") VALUES ($1, $2, $3, $4, $5, $6) - ON CONFLICT ON CONSTRAINT actor_block_unique_subject DO NOTHING", + ON CONFLICT DO NOTHING", &[&uri, &cid, &did, &subject, &created_at, &indexed_at], ) .await?; @@ -3183,7 +3636,13 @@ impl IndexerManager { .execute( "INSERT INTO profile (uri, cid, creator, \"displayName\", description, \"avatarCid\", \"bannerCid\", \"joinedViaStarterPackUri\", \"createdAt\", \"indexedAt\") VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) - ON CONFLICT DO NOTHING", + ON CONFLICT (uri) DO UPDATE SET + cid = EXCLUDED.cid, + \"displayName\" = EXCLUDED.\"displayName\", + description = EXCLUDED.description, + 
\"avatarCid\" = EXCLUDED.\"avatarCid\", + \"bannerCid\" = EXCLUDED.\"bannerCid\", + \"indexedAt\" = EXCLUDED.\"indexedAt\"", &[&uri, &cid, &did, &display_name, &description, &avatar_cid, &banner_cid, &joined_via_uri, &created_at, &indexed_at], ) .await?; @@ -3192,13 +3651,17 @@ impl IndexerManager { if let Some(starter_pack_uri_str) = joined_via_uri { if let Ok(starter_pack_uri) = AtUri::new(starter_pack_uri_str.to_owned(), None) { let starter_pack_author = starter_pack_uri.get_hostname(); - client + if let Err(e) = client .execute( "INSERT INTO notification (did, author, \"recordUri\", \"recordCid\", reason, \"reasonSubject\", \"sortAt\") - VALUES ($1, $2, $3, $4, $5, $6, $7)", + VALUES ($1, $2, $3, $4, $5, $6, $7) + ON CONFLICT (did, \"recordUri\", reason) DO NOTHING", &[&starter_pack_author, &did, &uri, &cid, &"starterpack-joined", &Some(starter_pack_uri_str), &indexed_at], ) - .await?; + .await + { + tracing::warn!("failed to insert starterpack-joined notification for {uri}: {e}"); + } } } } @@ -3251,7 +3714,13 @@ impl IndexerManager { .execute( "INSERT INTO feed_generator (uri, cid, creator, \"feedDid\", \"displayName\", description, \"descriptionFacets\", \"avatarCid\", \"createdAt\", \"indexedAt\") VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) - ON CONFLICT DO NOTHING", + ON CONFLICT (uri) DO UPDATE SET + cid = EXCLUDED.cid, + \"displayName\" = EXCLUDED.\"displayName\", + description = EXCLUDED.description, + \"descriptionFacets\" = EXCLUDED.\"descriptionFacets\", + \"avatarCid\" = EXCLUDED.\"avatarCid\", + \"indexedAt\" = EXCLUDED.\"indexedAt\"", &[&uri, &cid, &did, &feed_did, &display_name, &description, &description_facets, &avatar_cid, &created_at, &indexed_at], ) .await?; @@ -3304,7 +3773,13 @@ impl IndexerManager { .execute( "INSERT INTO list (uri, cid, creator, name, purpose, description, \"descriptionFacets\", \"avatarCid\", \"createdAt\", \"indexedAt\") VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) - ON CONFLICT DO NOTHING", + ON CONFLICT 
(uri) DO UPDATE SET + cid = EXCLUDED.cid, + name = EXCLUDED.name, + description = EXCLUDED.description, + \"descriptionFacets\" = EXCLUDED.\"descriptionFacets\", + \"avatarCid\" = EXCLUDED.\"avatarCid\", + \"indexedAt\" = EXCLUDED.\"indexedAt\"", &[&uri, &cid, &did, &name, &purpose, &description, &description_facets, &avatar_cid, &created_at, &indexed_at], ) .await?; @@ -3444,7 +3919,10 @@ impl IndexerManager { .execute( "INSERT INTO starter_pack (uri, cid, creator, name, \"createdAt\", \"indexedAt\") VALUES ($1, $2, $3, $4, $5, $6) - ON CONFLICT DO NOTHING", + ON CONFLICT (uri) DO UPDATE SET + cid = EXCLUDED.cid, + name = EXCLUDED.name, + \"indexedAt\" = EXCLUDED.\"indexedAt\"", &[&uri, &cid, &did, &name, &created_at, &indexed_at], ) .await?; @@ -3727,7 +4205,7 @@ impl IndexerManager { indexed_at: &str, ) -> Result<(), WintermuteError> { let uri_obj = AtUri::new( - format!("at://{did}/app.bsky.verification.proof/{rkey}"), + format!("at://{did}/app.bsky.graph.verification/{rkey}"), None, ) .map_err(|e| WintermuteError::Other(format!("invalid uri: {e}")))?; @@ -3765,7 +4243,7 @@ impl IndexerManager { rkey: &str, ) -> Result<(), WintermuteError> { let uri_obj = AtUri::new( - format!("at://{did}/app.bsky.verification.proof/{rkey}"), + format!("at://{did}/app.bsky.graph.verification/{rkey}"), None, ) .map_err(|e| WintermuteError::Other(format!("invalid uri: {e}")))?; diff --git a/rsky-wintermute/src/indexer/tests.rs b/rsky-wintermute/src/indexer/tests.rs index 513814b6..7245a6db 100644 --- a/rsky-wintermute/src/indexer/tests.rs +++ b/rsky-wintermute/src/indexer/tests.rs @@ -492,8 +492,11 @@ mod indexer_tests { labels: vec![crate::types::Label { src: test_src.to_owned(), uri: test_uri.to_owned(), + cid: None, val: "spam".to_owned(), + neg: false, cts: "2025-01-20T10:00:00Z".to_owned(), + exp: None, }], }; @@ -514,7 +517,7 @@ mod indexer_tests { // Verify the label data let row = client .query_one( - "SELECT src, uri, val, cts FROM label WHERE src = $1 AND cid = ''", + 
"SELECT src, uri, val, cts, neg FROM label WHERE src = $1 AND cid = ''", &[&test_src], ) .await @@ -524,11 +527,13 @@ mod indexer_tests { let uri: String = row.get(1); let val: String = row.get(2); let cts: String = row.get(3); + let neg: bool = row.get(4); assert_eq!(src, test_src); assert_eq!(uri, test_uri); assert_eq!(val, "spam"); assert_eq!(cts, "2025-01-20T10:00:00Z"); + assert!(!neg, "expected neg to be false"); cleanup_test_labels(&pool, test_src).await; } @@ -546,20 +551,29 @@ mod indexer_tests { crate::types::Label { src: test_src.to_owned(), uri: "at://did:plc:user1/app.bsky.feed.post/post1".to_owned(), + cid: None, val: "spam".to_owned(), + neg: false, cts: "2025-01-20T10:00:00Z".to_owned(), + exp: None, }, crate::types::Label { src: test_src.to_owned(), uri: "at://did:plc:user2/app.bsky.feed.post/post2".to_owned(), + cid: None, val: "nsfw".to_owned(), + neg: false, cts: "2025-01-20T10:01:00Z".to_owned(), + exp: None, }, crate::types::Label { src: test_src.to_owned(), uri: "at://did:plc:user3/app.bsky.feed.post/post3".to_owned(), + cid: None, val: "porn".to_owned(), + neg: false, cts: "2025-01-20T10:02:00Z".to_owned(), + exp: None, }, ], }; @@ -626,8 +640,11 @@ mod indexer_tests { labels: vec![crate::types::Label { src: test_src.to_owned(), uri: test_uri.to_owned(), + cid: None, val: "spam".to_owned(), + neg: false, cts: "2025-01-20T10:00:00Z".to_owned(), + exp: None, }], }; @@ -651,8 +668,11 @@ mod indexer_tests { labels: vec![crate::types::Label { src: test_src.to_owned(), uri: test_uri.to_owned(), + cid: None, val: "spam".to_owned(), + neg: false, cts: "2025-01-20T11:00:00Z".to_owned(), // Different timestamp + exp: None, }], }; @@ -700,8 +720,11 @@ mod indexer_tests { labels: vec![crate::types::Label { src: test_src1.to_owned(), uri: test_uri.to_owned(), + cid: None, val: "spam".to_owned(), + neg: false, cts: "2025-01-20T10:00:00Z".to_owned(), + exp: None, }], }; @@ -714,8 +737,11 @@ mod indexer_tests { labels: vec![crate::types::Label { src: 
test_src2.to_owned(), uri: test_uri.to_owned(), + cid: None, val: "spam".to_owned(), + neg: false, cts: "2025-01-20T10:01:00Z".to_owned(), + exp: None, }], }; @@ -754,8 +780,11 @@ mod indexer_tests { labels: vec![crate::types::Label { src: test_src.to_owned(), uri: test_uri.to_owned(), + cid: None, val: "spam".to_owned(), + neg: false, cts: "2025-01-20T10:00:00Z".to_owned(), + exp: None, }], }; @@ -768,8 +797,11 @@ mod indexer_tests { labels: vec![crate::types::Label { src: test_src.to_owned(), uri: test_uri.to_owned(), + cid: None, val: "nsfw".to_owned(), + neg: false, cts: "2025-01-20T10:01:00Z".to_owned(), + exp: None, }], }; @@ -807,8 +839,11 @@ mod indexer_tests { labels: vec![crate::types::Label { src: test_src.to_owned(), uri: "at://did:plc:user/app.bsky.feed.post/roundtrip".to_owned(), + cid: None, val: "spam".to_owned(), + neg: false, cts: "2025-01-20T10:00:00Z".to_owned(), + exp: None, }], }; @@ -845,6 +880,86 @@ mod indexer_tests { cleanup_test_labels(&pool, test_src).await; } + #[tokio::test] + async fn test_label_negation() { + let pool = setup_test_pool(); + let test_src = "did:plc:test_labeler_negation"; + let test_uri = "did:plc:user_negation_test"; + + cleanup_test_labels(&pool, test_src).await; + + // First: apply a takedown label + let label_event1 = crate::types::LabelEvent { + seq: 8000, + labels: vec![crate::types::Label { + src: test_src.to_owned(), + uri: test_uri.to_owned(), + cid: None, + val: "!takedown".to_owned(), + neg: false, + cts: "2025-01-20T10:00:00Z".to_owned(), + exp: None, + }], + }; + + let result = IndexerManager::process_label_event(&pool, &label_event1).await; + assert!(result.is_ok()); + + // Verify label exists with neg=false + let client = pool.get().await.unwrap(); + let row = client + .query_one( + "SELECT neg FROM label WHERE src = $1 AND uri = $2 AND cid = '' AND val = '!takedown'", + &[&test_src, &test_uri], + ) + .await + .unwrap(); + let neg: bool = row.get(0); + assert!(!neg, "initial label should have 
neg=false"); + + // Second: negate the takedown label + let label_event2 = crate::types::LabelEvent { + seq: 8001, + labels: vec![crate::types::Label { + src: test_src.to_owned(), + uri: test_uri.to_owned(), + cid: None, + val: "!takedown".to_owned(), + neg: true, + cts: "2025-01-20T11:00:00Z".to_owned(), + exp: None, + }], + }; + + let result = IndexerManager::process_label_event(&pool, &label_event2).await; + assert!(result.is_ok()); + + // Verify label now has neg=true (upserted, not duplicated) + let count: i64 = client + .query_one( + "SELECT COUNT(*) FROM label WHERE src = $1 AND uri = $2 AND cid = '' AND val = '!takedown'", + &[&test_src, &test_uri], + ) + .await + .unwrap() + .get(0); + assert_eq!(count, 1, "should still be 1 row after negation"); + + let row = client + .query_one( + "SELECT neg, cts FROM label WHERE src = $1 AND uri = $2 AND cid = '' AND val = '!takedown'", + &[&test_src, &test_uri], + ) + .await + .unwrap(); + let neg: bool = row.get(0); + let cts: String = row.get(1); + assert!(neg, "neg should be true after negation"); + assert_eq!(cts, "2025-01-20T11:00:00Z", "cts should be updated"); + + cleanup_test_labels(&pool, test_src).await; + } + #[tokio::test] async fn test_label_indexing_empty_labels_array() { let pool = setup_test_pool(); @@ -1065,6 +1180,8 @@ mod indexer_tests { ], blocks: vec![], }), + identity: None, + account: None, }; // Step 2: Simulate ingester processing event (enqueue to firehose_live) @@ -1591,7 +1708,7 @@ mod indexer_tests { // Test newer collection types that were previously untested let test_collections = vec![ ( - "app.bsky.verification.proof", + "app.bsky.graph.verification", json!({ "subject": "did:plc:verified", "handle": "verified.test", diff --git a/rsky-wintermute/src/ingester/labels.rs b/rsky-wintermute/src/ingester/labels.rs index 939f2e5e..f27088dd 100644 --- a/rsky-wintermute/src/ingester/labels.rs +++ b/rsky-wintermute/src/ingester/labels.rs @@ -235,11 +235,14 @@ pub fn parse_label_message(data: 
&[u8]) -> Result, Wintermute struct RawLabel { src: String, uri: String, - val: String, - #[allow(dead_code)] #[serde(default)] cid: Option, + val: String, + #[serde(default)] + neg: Option, cts: String, + #[serde(default)] + exp: Option, } let mut cursor = std::io::Cursor::new(data); @@ -262,8 +265,11 @@ pub fn parse_label_message(data: &[u8]) -> Result, Wintermute .map(|raw| Label { src: raw.src, uri: raw.uri, + cid: raw.cid, val: raw.val, + neg: raw.neg.unwrap_or(false), cts: raw.cts, + exp: raw.exp, }) .collect(); diff --git a/rsky-wintermute/src/ingester/mod.rs b/rsky-wintermute/src/ingester/mod.rs index 1842da76..3c04bf19 100644 --- a/rsky-wintermute/src/ingester/mod.rs +++ b/rsky-wintermute/src/ingester/mod.rs @@ -33,6 +33,7 @@ enum ConnectionResult { } #[derive(Debug)] +#[allow(clippy::large_enum_variant)] pub enum ParseResult { Event(FirehoseEvent), Skip, @@ -376,6 +377,84 @@ impl IngesterManager { .with_label_values(&["firehose_live"]) .inc(); + // Handle identity events separately (handle changes, key rotations) + if event.kind == "identity" { + let pool_clone = Arc::clone(pool); + let event_did = event.did.clone(); + let event_time = event.time.clone(); + let event_handle = event.identity.as_ref().and_then(|i| i.handle.clone()); + tokio::spawn(async move { + if let Err(e) = Self::process_identity_event( + &pool_clone, + &event_did, + &event_time, + event_handle.as_deref(), + ) + .await + { + tracing::error!( + "identity event processing failed for {}: {e}", + event_did + ); + metrics::INGESTER_ERRORS_TOTAL + .with_label_values(&["identity_failed"]) + .inc(); + } + }); + last_seq.store(event.seq, Ordering::Relaxed); + continue; + } + + // Handle account events (takedown, suspension, deletion, reactivation) + if event.kind == "account" { + if let Some(ref account) = event.account { + let pool_clone = Arc::clone(pool); + let event_did = event.did.clone(); + let active = account.active; + let status = account.status.clone(); + tokio::spawn(async move { + if 
let Err(e) = Self::process_account_event( + &pool_clone, + &event_did, + active, + status.as_deref(), + ) + .await + { + tracing::error!( + "account event processing failed for {}: {e}", + event_did + ); + metrics::INGESTER_ERRORS_TOTAL + .with_label_values(&["account_failed"]) + .inc(); + } + }); + } + last_seq.store(event.seq, Ordering::Relaxed); + continue; + } + + // Handle sync events (repo recovery - refresh handle like identity events) + if event.kind == "sync" { + let pool_clone = Arc::clone(pool); + let event_did = event.did.clone(); + let event_time = event.time.clone(); + tokio::spawn(async move { + if let Err(e) = + Self::process_identity_event(&pool_clone, &event_did, &event_time, None) + .await + { + tracing::error!("sync event processing failed for {}: {e}", event_did); + metrics::INGESTER_ERRORS_TOTAL + .with_label_values(&["sync_failed"]) + .inc(); + } + }); + last_seq.store(event.seq, Ordering::Relaxed); + continue; + } + // Process inline: parse event and spawn indexing tasks directly (skip Fjall queue) match Self::parse_event_to_jobs(&event).await { Ok(jobs) => { @@ -483,7 +562,73 @@ impl IngesterManager { return Ok(ParseResult::Skip); } - // Only process #commit messages + // Handle #identity events (handle changes, key rotations, etc) + if header.type_ == "#identity" { + let body: rsky_lexicon::com::atproto::sync::SubscribeReposIdentity = + serde_ipld_dagcbor::from_reader(&mut cursor).map_err(|e| { + WintermuteError::Serialization(format!("failed to parse identity body: {e}")) + })?; + + let event = FirehoseEvent { + seq: body.seq, + did: body.did, + time: body.time.to_rfc3339(), + kind: "identity".to_owned(), + commit: None, + identity: Some(crate::types::IdentityData { + handle: body.handle, + }), + account: None, + }; + + return Ok(ParseResult::Event(event)); + } + + // Handle #account events (takedown, suspension, deletion, etc.) 
+ if header.type_ == "#account" { + let body: rsky_lexicon::com::atproto::sync::SubscribeReposAccount = + serde_ipld_dagcbor::from_reader(&mut cursor).map_err(|e| { + WintermuteError::Serialization(format!("failed to parse account body: {e}")) + })?; + + let event = FirehoseEvent { + seq: body.seq, + did: body.did, + time: body.time.to_rfc3339(), + kind: "account".to_owned(), + commit: None, + identity: None, + account: Some(crate::types::AccountData { + active: body.active, + status: body.status.map(|s| s.to_string().to_lowercase()), + }), + }; + + return Ok(ParseResult::Event(event)); + } + + // Handle #sync events (repo state recovery/updates) + if header.type_ == "#sync" { + let body: rsky_lexicon::com::atproto::sync::SubscribeReposSync = + serde_ipld_dagcbor::from_reader(&mut cursor).map_err(|e| { + WintermuteError::Serialization(format!("failed to parse sync body: {e}")) + })?; + + // Treat sync like identity - refresh the handle + let event = FirehoseEvent { + seq: body.seq, + did: body.did, + time: body.time.to_rfc3339(), + kind: "sync".to_owned(), + commit: None, + identity: None, + account: None, + }; + + return Ok(ParseResult::Event(event)); + } + + // Only process #commit messages beyond this point if header.type_ != "#commit" { return Ok(ParseResult::Skip); } @@ -515,6 +660,8 @@ impl IngesterManager { ops, blocks: body.blocks, }), + identity: None, + account: None, }; Ok(ParseResult::Event(event)) @@ -713,6 +860,149 @@ impl IngesterManager { Ok(()) } + + /// Process an identity event by resolving the DID and updating the actor table + async fn process_identity_event( + pool: &Pool, + did: &str, + timestamp: &str, + handle_hint: Option<&str>, + ) -> Result<(), WintermuteError> { + use rsky_identity::IdResolver; + use rsky_identity::types::IdentityResolverOpts; + + tracing::debug!("processing identity event for {}", did); + + // If the event includes the handle, we can use it directly + // Otherwise, resolve the DID to get the current handle from the DID 
document + let handle = if let Some(h) = handle_hint { + Some(h.to_lowercase()) + } else { + // Resolve DID to get current handle from DID document + let mut resolver = IdResolver::new(IdentityResolverOpts { + timeout: Some(std::time::Duration::from_secs(5)), + plc_url: None, + did_cache: None, + backup_nameservers: None, + }); + + match resolver.did.resolve(did.to_owned(), None).await { + Ok(Some(doc)) => { + // Extract handle from alsoKnownAs (at:// URIs) + let handle = doc.also_known_as.as_ref().and_then(|akas| { + akas.iter() + .find(|aka| aka.starts_with("at://")) + .map(|aka| aka.strip_prefix("at://").unwrap_or(aka).to_lowercase()) + }); + + if let Some(ref h) = handle { + // Verify handle resolves back to this DID + match resolver.handle.resolve(h).await { + Ok(Some(resolved_did)) if resolved_did == did => { + tracing::info!("identity event: verified handle {} for {}", h, did); + } + _ => { + tracing::debug!( + "handle {} does not resolve back to {} - setting handle to null", + h, + did + ); + return Ok(()); // Don't update if handle doesn't verify + } + } + } + + handle + } + Ok(None) => { + tracing::warn!("DID {} not found", did); + return Ok(()); + } + Err(e) => { + tracing::warn!("failed to resolve DID {}: {}", did, e); + return Ok(()); // Don't fail on resolution errors, just skip + } + } + }; + + // Update actor table + let client = pool.get().await?; + let result = client + .execute( + "UPDATE actor SET handle = $1, \"indexedAt\" = $2 WHERE did = $3", + &[&handle, ×tamp, &did], + ) + .await?; + + if result > 0 { + tracing::info!( + "updated handle for {} to {:?}", + did, + handle.as_deref().unwrap_or("null") + ); + } else { + tracing::debug!("no actor found to update for {}", did); + } + + Ok(()) + } + + /// Process an account event by updating the actor's upstream status + async fn process_account_event( + pool: &Pool, + did: &str, + active: bool, + status: Option<&str>, + ) -> Result<(), WintermuteError> { + tracing::debug!( + "processing account 
event for {}: active={}, status={:?}", + did, + active, + status + ); + + // Determine upstream_status based on active flag and status + let upstream_status: Option<&str> = if active { + // Active accounts have no upstream status + None + } else { + // Inactive accounts: check for recognized statuses + match status { + Some(s) if ["deactivated", "suspended", "takendown", "deleted"].contains(&s) => { + Some(s) + } + Some(s) => { + tracing::warn!("unrecognized account status '{}' for {}", s, did); + Some(s) // Still store it, just log a warning + } + None => { + tracing::warn!("inactive account {} has no status", did); + None + } + } + }; + + // Update actor table + let client = pool.get().await?; + let result = client + .execute( + "UPDATE actor SET \"upstreamStatus\" = $1 WHERE did = $2", + &[&upstream_status, &did], + ) + .await?; + + if result > 0 { + tracing::info!( + "updated upstream_status for {} to {:?}", + did, + upstream_status.unwrap_or("null") + ); + } else { + tracing::debug!("no actor found to update status for {}", did); + } + + Ok(()) + } } async fn get_cursor_from_postgres(pool: &Pool, service: &str) -> Result { diff --git a/rsky-wintermute/src/ingester/tests.rs b/rsky-wintermute/src/ingester/tests.rs index 5f1e0646..a2b707a6 100644 --- a/rsky-wintermute/src/ingester/tests.rs +++ b/rsky-wintermute/src/ingester/tests.rs @@ -263,6 +263,8 @@ mod ingester_tests { ops: vec![], blocks: vec![10, 20, 30], }), + identity: None, + account: None, }; storage.write_firehose_event(event.seq, &event).unwrap(); @@ -384,6 +386,8 @@ mod ingester_tests { ops: vec![], blocks: vec![], }), + identity: None, + account: None, }, FirehoseEvent { seq: 2, @@ -395,6 +399,8 @@ mod ingester_tests { ops: vec![], blocks: vec![], }), + identity: None, + account: None, }, FirehoseEvent { seq: 3, @@ -406,6 +412,8 @@ mod ingester_tests { ops: vec![], blocks: vec![], }), + identity: None, + account: None, }, ]; @@ -452,6 +460,19 @@ mod ingester_tests { fn create_label_message( seq: 
i64, labels: Vec<(&str, &str, &str, &str)>, // (src, uri, val, cts) + ) -> Vec { + create_label_message_with_neg( + seq, + labels + .into_iter() + .map(|(src, uri, val, cts)| (src, uri, val, cts, false)) + .collect(), + ) + } + + fn create_label_message_with_neg( + seq: i64, + labels: Vec<(&str, &str, &str, &str, bool)>, // (src, uri, val, cts, neg) ) -> Vec { #[derive(serde::Serialize)] struct Header { @@ -465,6 +486,8 @@ mod ingester_tests { uri: String, val: String, cts: String, + #[serde(skip_serializing_if = "std::ops::Not::not")] + neg: bool, } #[derive(serde::Serialize)] @@ -482,11 +505,12 @@ mod ingester_tests { seq, labels: labels .into_iter() - .map(|(src, uri, val, cts)| RawLabel { + .map(|(src, uri, val, cts, neg)| RawLabel { src: src.to_owned(), uri: uri.to_owned(), val: val.to_owned(), cts: cts.to_owned(), + neg, }) .collect(), }; @@ -521,6 +545,32 @@ mod ingester_tests { assert_eq!(label.uri, "at://did:plc:user456/app.bsky.feed.post/abc123"); assert_eq!(label.val, "spam"); assert_eq!(label.cts, "2025-01-20T10:30:00Z"); + assert!(!label.neg, "default neg should be false"); + } + + #[test] + fn test_parse_label_message_with_negation() { + let msg_bytes = create_label_message_with_neg( + 99999, + vec![( + "did:plc:ar7c4by46qjdydhdevvrndac", + "did:plc:user123", + "!takedown", + "2025-11-27T06:20:00Z", + true, + )], + ); + + let result = crate::ingester::labels::parse_label_message(&msg_bytes).unwrap(); + assert!(result.is_some()); + let label_event = result.unwrap(); + assert_eq!(label_event.labels.len(), 1); + + let label = &label_event.labels[0]; + assert_eq!(label.src, "did:plc:ar7c4by46qjdydhdevvrndac"); + assert_eq!(label.uri, "did:plc:user123"); + assert_eq!(label.val, "!takedown"); + assert!(label.neg, "neg should be true for negation labels"); } #[test] @@ -595,14 +645,20 @@ mod ingester_tests { crate::types::Label { src: "did:plc:labeler".to_owned(), uri: "at://did:plc:user/app.bsky.feed.post/abc".to_owned(), + cid: None, val: 
"spam".to_owned(), + neg: false, cts: "2025-01-20T10:00:00Z".to_owned(), + exp: None, }, crate::types::Label { src: "did:plc:labeler".to_owned(), uri: "at://did:plc:user/app.bsky.feed.post/def".to_owned(), + cid: None, val: "nsfw".to_owned(), + neg: false, cts: "2025-01-20T10:01:00Z".to_owned(), + exp: None, }, ], }; @@ -798,6 +854,8 @@ mod ingester_tests { ], blocks: vec![], }), + identity: None, + account: None, }; // Queue should be empty initially @@ -874,6 +932,8 @@ mod ingester_tests { ops: vec![], // No operations blocks: vec![], }), + identity: None, + account: None, }; // Should succeed but not enqueue anything @@ -895,6 +955,8 @@ mod ingester_tests { time: "2024-01-01T00:00:00Z".to_owned(), kind: "identity".to_owned(), commit: None, // No commit data + identity: None, + account: None, }; // Should succeed but not enqueue anything diff --git a/rsky-wintermute/src/storage.rs b/rsky-wintermute/src/storage.rs index a7555b52..fd381d43 100644 --- a/rsky-wintermute/src/storage.rs +++ b/rsky-wintermute/src/storage.rs @@ -205,6 +205,20 @@ impl Storage { Ok(()) } + /// Enqueue a backfill job with IMMEDIATE priority (timestamp 0) + /// These items are processed FIRST, before all other priority items + pub fn enqueue_backfill_immediate(&self, job: &BackfillJob) -> Result<(), WintermuteError> { + // Key format: "0:0:{did}" - timestamp 0 ensures it sorts first + let key = format!("0:0:{}", job.did); + let mut value = Vec::new(); + ciborium::into_writer(job, &mut value) + .map_err(|e| WintermuteError::Serialization(format!("failed to serialize job: {e}")))?; + self.repo_backfill + .insert(key.as_bytes(), value.as_slice())?; + crate::metrics::INGESTER_REPO_BACKFILL_LENGTH.inc(); + Ok(()) + } + pub fn dequeue_backfill(&self) -> Result, BackfillJob)>, WintermuteError> { let mut iter = self.repo_backfill.iter(); let Some(entry) = iter.next() else { @@ -808,6 +822,8 @@ mod tests { ops: vec![], blocks: vec![], }), + identity: None, + account: None, }; 
storage.write_firehose_event(12345, &event).unwrap(); @@ -1167,8 +1183,11 @@ mod tests { labels: vec![Label { src: "did:plc:labeler".to_owned(), uri: "at://did:plc:test/app.bsky.feed.post/123".to_owned(), + cid: None, val: "spam".to_owned(), + neg: false, cts: "2025-01-01T00:00:00Z".to_owned(), + exp: None, }], }; diff --git a/rsky-wintermute/src/types.rs b/rsky-wintermute/src/types.rs index 59110f75..6a6c4edd 100644 --- a/rsky-wintermute/src/types.rs +++ b/rsky-wintermute/src/types.rs @@ -51,6 +51,19 @@ pub struct FirehoseEvent { pub time: String, pub kind: String, pub commit: Option, + pub identity: Option, + pub account: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct IdentityData { + pub handle: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AccountData { + pub active: bool, + pub status: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -102,6 +115,9 @@ pub struct LabelEvent { pub struct Label { pub src: String, pub uri: String, + pub cid: Option, pub val: String, + pub neg: bool, pub cts: String, + pub exp: Option, }