diff --git a/crates/integration-tests/Cargo.toml b/crates/integration-tests/Cargo.toml index 24d0cdcc..16ea420e 100644 --- a/crates/integration-tests/Cargo.toml +++ b/crates/integration-tests/Cargo.toml @@ -9,6 +9,11 @@ name = "integration" path = "tests/integration.rs" harness = true +[[test]] +name = "routing" +path = "tests/routing.rs" +harness = true + [dev-dependencies] testcontainers = { version = "0.25", features = ["blocking"] } reqwest = { version = "0.12", features = ["blocking"] } diff --git a/crates/integration-tests/tests/routing.rs b/crates/integration-tests/tests/routing.rs new file mode 100644 index 00000000..bdbff5fd --- /dev/null +++ b/crates/integration-tests/tests/routing.rs @@ -0,0 +1,226 @@ +#![allow(dead_code)] + +mod common; +mod environments; + +use common::runtime::RuntimeEnvironment as _; +use environments::fastly::FastlyViceroy; +use std::io::{Read as _, Write as _}; +use std::net::{TcpListener, TcpStream}; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, OnceLock}; +use std::thread; + +/// In-process HTTP server that returns a fixed response body. +/// +/// Listens on a fixed port. Accepts connections in a background thread, +/// reads once from each connection, and responds with `HTTP/1.1 200 OK` and the +/// configured body. Stopped on [`Drop`] via a shutdown flag + self-connect. +/// +/// The accept thread is detached: its `JoinHandle` is discarded, so the thread is +/// never joined. An instance kept in a `static OnceLock` is never dropped, so in +/// that case the thread simply ends when the process ends. +struct MockOrigin { + port: u16, + shutdown: Arc, +} + +impl MockOrigin { + /// Start a mock origin server on `port` that always responds with `body`. + /// + /// # Panics + /// + /// Panics if the port cannot be bound.
+ fn start(port: u16, body: &'static str) -> Self { + let listener = TcpListener::bind(format!("127.0.0.1:{port}")) + .unwrap_or_else(|e| panic!("should bind MockOrigin to port {port}: {e}")); + + let shutdown = Arc::new(AtomicBool::new(false)); + let shutdown_clone = Arc::clone(&shutdown); + + thread::spawn(move || { + for stream in listener.incoming() { + if shutdown_clone.load(Ordering::Relaxed) { + break; + } + if let Ok(stream) = stream { + serve(stream, body); + } + } + }); + + MockOrigin { port, shutdown } + } +} + +impl Drop for MockOrigin { + fn drop(&mut self) { + self.shutdown.store(true, Ordering::Relaxed); + // Unblock the accept() call so the thread can observe the shutdown flag; the connect result is ignored on purpose. + let _ = TcpStream::connect(format!("127.0.0.1:{}", self.port)); + } +} + +/// Write a minimal HTTP/1.1 200 response with `body` to `stream`. +/// +/// Reads up to 4096 bytes of the request first (a single `read`, not a full drain) so the client does not see a broken pipe. +fn serve(mut stream: TcpStream, body: &'static str) { + let mut buf = [0u8; 4096]; + let _ = stream.read(&mut buf); + let response = format!( + "HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nContent-Length: {len}\r\nConnection: close\r\n\r\n{body}", + len = body.len(), + ); + let _ = stream.write_all(response.as_bytes()); +} + +/// Shared test state: mock origins + Viceroy process + pre-configured reqwest client. +/// +/// Initialised once via [`get_harness`]. All five test functions share this +/// single instance to avoid the cost of spinning up Viceroy per test. +struct RoutingHarness { + _origins: Vec, + _process: common::runtime::RuntimeProcess, + /// Client with resolve overrides so `http://site-a.test/` connects to Viceroy + /// while sending the correct `Host` header. + client: reqwest::blocking::Client, +} + +static HARNESS: OnceLock> = OnceLock::new(); + +/// Return the shared harness, or `None` if `ROUTING_WASM_PATH` is not set.
+/// +/// Returns `None` rather than panicking so that tests pass trivially when +/// invoked outside the routing-specific CI step (e.g. `cargo test --workspace`). +fn get_harness() -> Option<&'static RoutingHarness> { + HARNESS + .get_or_init(|| { + let wasm_path = std::env::var("ROUTING_WASM_PATH").ok()?; + + let origins = vec![ + MockOrigin::start(19090, "default"), + MockOrigin::start(19091, "site-a"), + MockOrigin::start(19092, "site-b"), + MockOrigin::start(19093, "api"), + ]; + + let process = FastlyViceroy + .spawn(std::path::Path::new(&wasm_path)) + .expect("should spawn Viceroy with routing WASM"); + + let viceroy_port: u16 = process + .base_url + .trim_start_matches("http://127.0.0.1:") + .parse() + .expect("should parse Viceroy port from base_url"); + + let viceroy_addr: std::net::SocketAddr = format!("127.0.0.1:{viceroy_port}") + .parse() + .expect("should parse Viceroy socket addr"); + + let client = reqwest::blocking::ClientBuilder::new() + .resolve("site-a.test", viceroy_addr) + .resolve("www.site-a.test", viceroy_addr) + .resolve("site-b.test", viceroy_addr) + .resolve("any.test", viceroy_addr) + .resolve("unknown.test", viceroy_addr) + .build() + .expect("should build reqwest client"); + + Some(RoutingHarness { + _origins: origins, + _process: process, + client, + }) + }) + .as_ref() +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[test] +fn domain_routes_to_site_a() { + let Some(h) = get_harness() else { return }; + + let body = h + .client + .get("http://site-a.test/") + .send() + .expect("should send request to site-a.test") + .text() + .expect("should read response body"); + + assert_eq!(body, "site-a", "should route site-a.test to the site-a backend"); +} + +#[test] +fn domain_routes_to_site_b() { + let Some(h) = get_harness() else { return }; + + let body = h + .client + .get("http://site-b.test/") + .send() + 
.expect("should send request to site-b.test") + .text() + .expect("should read response body"); + + assert_eq!(body, "site-b", "should route site-b.test to the site-b backend"); +} + +#[test] +fn www_prefix_stripped() { + let Some(h) = get_harness() else { return }; + + let body = h + .client + .get("http://www.site-a.test/") + .send() + .expect("should send request to www.site-a.test") + .text() + .expect("should read response body"); + + assert_eq!( + body, "site-a", + "should strip www. prefix and route to the site-a backend" + ); +} + +#[test] +fn path_routes_to_api() { + let Some(h) = get_harness() else { return }; + + // any.test has no domain entry — path pattern matching fires instead. + let body = h + .client + .get("http://any.test/.api/users") + .send() + .expect("should send request to any.test/.api/users") + .text() + .expect("should read response body"); + + assert_eq!( + body, "api", + "should route /.api/ path prefix to the api backend" + ); +} + +#[test] +fn unknown_host_falls_back_to_default() { + let Some(h) = get_harness() else { return }; + + let body = h + .client + .get("http://unknown.test/") + .send() + .expect("should send request to unknown.test") + .text() + .expect("should read response body"); + + assert_eq!( + body, "default", + "should fall back to publisher.origin_url for unmatched hosts" + ); +} diff --git a/crates/trusted-server-adapter-fastly/backends.toml b/crates/trusted-server-adapter-fastly/backends.toml new file mode 100644 index 00000000..fbb04045 --- /dev/null +++ b/crates/trusted-server-adapter-fastly/backends.toml @@ -0,0 +1,176 @@ +# Arena Group / SayMedia backend routing configuration. +# +# This file is merged into the embedded binary config at build time by +# crates/trusted-server-core/build.rs. It is separate from trusted-server.toml to keep +# customer-specific configuration out of the shared application template. 
+ +[[backends]] +id = "raven" +origin_url = "https://raven-public.prod.saymedia.com" +certificate_check = true +domains = [ + "active.com", + "americansongwriter.com", + "athleticbusiness.com", + "athlonsports.com", + "autoblog.com", + "azbigmedia.com", + "benzinga.com", + "bestproducts.com", + "bicycling.com", + "biography.com", + "bizjournals.com", + "bleacherreport.com", + "blogher.com", + "carsdirect.com", + "catalog.thearenagroup.net", + "cbsnews.com", + "cheatsheet.com", + "chron.com", + "cinemablend.com", + "citybeatnews.com", + "coachmag.co.uk", + "coastalliving.com", + "collegehumor.com", + "countryliving.com", + "ctnewsonline.com", + "dailypress.com", + "delish.com", + "denofgeek.com", + "detroit.cbslocal.com", + "digg.com", + "digitalspy.com", + "diynetwork.com", + "dooyoo.co.uk", + "dualshockers.com", + "eater.com", + "elle.com", + "elledecor.com", + "esquire.com", + "eurweb.com", + "everydayhealth.com", + "fansided.com", + "fightful.com", + "filmschoolrejects.com", + "fitbit.com", + "foodandwine.com", + "fool.com", + "forbes.com", + "freep.com", + "gamerant.com", + "gizmodo.com", + "glam.com", + "goodhousekeeping.com", + "grunge.com", + "health.com", + "healthline.com", + "hercampus.com", + "hgtv.com", + "history.com", + "hollywoodreporter.com", + "housebeautiful.com", + "huffpost.com", + "ibtimes.com", + "ign.com", + "indiewire.com", + "insidehook.com", + "instyle.com", + "investopedia.com", + "io9.com", + "jezebel.com", + "kiplinger.com", + "kotaku.com", + "latimes.com", + "law.com", + "lifehacker.com", + "livestrong.com", + "livescience.com", + "localiq.com", + "looper.com", + "mashable.com", + "mayoclinic.org", + "medicalnewstoday.com", + "menshealth.com", + "mensjournal.com", + "meredith.com", + "metro.co.uk", + "military.com", + "militarytimes.com", + "mlb.com", + "mlive.com", + "mnn.com", + "motorcyclistonline.com", + "msn.com", + "narcity.com", + "nationalreview.com", + "nbcnews.com", + "nerdist.com", + "newsweek.com", + "nj.com", + "nola.com", 
+ "npr.org", + "nypost.com", + "nytimes.com", + "observer.com", + "oregonlive.com", + "outsideonline.com", + "outsports.com", + "oxygen.com", + "parade.com", + "patch.com", + "pcgamer.com", + "pennlive.com", + "people.com", + "petmd.com", + "pgalinks.org", + "philly.com", + "polygon.com", + "popsugar.com", + "prevention.com", + "purewow.com", + "realclearpolitics.com", + "realsimple.com", + "realtor.com", + "refinery29.com", + "rollingstone.com", + "runnersworld.com", + "salon.com", + "scout.com", + "screenrant.com", + "sfgate.com", + "si.com", + "simplemost.com", + "slate.com", + "space.com", + "sporcle.com", + "sportingnews.com", + "sportskeeda.com", + "southernliving.com", + "stltoday.com", + "syracuse.com", + "tampabay.com", + "thedailybeast.com", + "thedailymeal.com", + "thedenverchannel.com", + "thegamer.com", + "thelist.com", + "thepioneerwoman.com", + "thestreet.com", + "thethings.com", + "time.com", + "tmz.com", + "today.com", + "townandcountrymag.com", + "travelandleisure.com", + "usatoday.com", + "variety.com", + "verywellhealth.com", + "vox.com", + "vulture.com", + "washingtonpost.com", + "webmd.com", + "wired.com", + "womansday.com", + "womenshealthmag.com", + "yahoo.com", +] diff --git a/crates/trusted-server-adapter-fastly/test-backends.toml b/crates/trusted-server-adapter-fastly/test-backends.toml new file mode 100644 index 00000000..5485f552 --- /dev/null +++ b/crates/trusted-server-adapter-fastly/test-backends.toml @@ -0,0 +1,21 @@ +# Test-only backend routing config. +# Embedded at compile time when ROUTING_TEST_BACKENDS=1 is set. +# Ports 19090-19093 are used by MockOrigin servers in tests/routing.rs. 
+ +[[backends]] +id = "site-a" +origin_url = "http://127.0.0.1:19091" +domains = ["site-a.test", "www.site-a.test"] + +[[backends]] +id = "site-b" +origin_url = "http://127.0.0.1:19092" +domains = ["site-b.test"] + +[[backends]] +id = "api" +origin_url = "http://127.0.0.1:19093" + + [[backends.path_patterns]] + host = "*" + path_prefix = "/.api/" diff --git a/crates/trusted-server-core/build.rs b/crates/trusted-server-core/build.rs index b2803135..b1cc467d 100644 --- a/crates/trusted-server-core/build.rs +++ b/crates/trusted-server-core/build.rs @@ -18,19 +18,46 @@ mod consent_config; #[path = "src/settings.rs"] mod settings; +use serde_json::Value; +use std::collections::HashSet; use std::fs; use std::path::Path; const TRUSTED_SERVER_INIT_CONFIG_PATH: &str = "../../trusted-server.toml"; const TRUSTED_SERVER_OUTPUT_CONFIG_PATH: &str = "../../target/trusted-server-out.toml"; +const BACKENDS_CONFIG_PATH: &str = "../../crates/trusted-server-adapter-fastly/backends.toml"; +const TEST_BACKENDS_CONFIG_PATH: &str = + "../../crates/trusted-server-adapter-fastly/test-backends.toml"; fn main() { - // Always rerun build.rs: integration settings are stored in a flat - // HashMap, so we cannot enumerate all possible env - // var keys ahead of time. Emitting rerun-if-changed for a nonexistent - // file forces cargo to always rerun the build script. 
- println!("cargo:rerun-if-changed=_always_rebuild_sentinel_"); + merge_toml(); + rerun_if_changed(); +} + +fn rerun_if_changed() { + // Watch every config input: the root trusted-server.toml, both backends files, and the env var that selects between them + println!("cargo:rerun-if-changed={}", TRUSTED_SERVER_INIT_CONFIG_PATH); + println!("cargo:rerun-if-changed={}", BACKENDS_CONFIG_PATH); + println!("cargo:rerun-if-changed={}", TEST_BACKENDS_CONFIG_PATH); + println!("cargo:rerun-if-env-changed=ROUTING_TEST_BACKENDS"); + + // Serialize a default Settings instance to JSON to discover field names. NOTE(review): only keys present in the defaults are found, so env vars for map-backed sections that start empty (e.g. integration settings) would not trigger a rerun - confirm this is acceptable. + let default_settings = settings::Settings::default(); + let settings_json = serde_json::to_value(&default_settings).expect("default Settings should serialize to JSON"); + + let mut env_vars = HashSet::new(); + collect_env_vars(&settings_json, &mut env_vars, &[]); + + // Print rerun-if-env-changed for each variable, sorted so the build script output is deterministic + let mut sorted_vars: Vec<_> = env_vars.into_iter().collect(); + sorted_vars.sort(); + for var in sorted_vars { + println!("cargo:rerun-if-env-changed={}", var); + } +} + +fn merge_toml() { // Read init config let init_config_path = Path::new(TRUSTED_SERVER_INIT_CONFIG_PATH); let toml_content = fs::read_to_string(init_config_path) @@ -43,13 +70,30 @@ fn main() { // production deployments override via TRUSTED_SERVER__* env vars at // build time. Runtime startup (get_settings) rejects any remaining // placeholders so a misconfigured deployment fails fast.
- let settings = settings::Settings::from_toml_and_env(&toml_content) + let mut settings = settings::Settings::from_toml_and_env(&toml_content) .expect("Failed to parse settings at build time"); - let merged_toml = - toml::to_string_pretty(&settings).expect("Failed to serialize settings to TOML"); + // Merge backends from the selected file, if present: test-backends.toml when ROUTING_TEST_BACKENDS is set (to any value), otherwise backends.toml + let backends_path = if std::env::var("ROUTING_TEST_BACKENDS").is_ok() { + Path::new(TEST_BACKENDS_CONFIG_PATH) + } else { + Path::new(BACKENDS_CONFIG_PATH) + }; + if backends_path.exists() { + #[derive(serde::Deserialize)] + struct BackendsFile { + backends: Vec, + } + let backends_toml = fs::read_to_string(backends_path) + .unwrap_or_else(|e| panic!("Failed to read {backends_path:?}: {e}")); + let backends_file: BackendsFile = + toml::from_str(&backends_toml).unwrap_or_else(|e| panic!("Failed to parse {backends_path:?}: {e}")); + settings.backends.extend(backends_file.backends); + } // Only write when content changes to avoid unnecessary recompilation.
+ let merged_toml = + toml::to_string_pretty(&settings).expect("Failed to serialize settings to TOML"); let dest_path = Path::new(TRUSTED_SERVER_OUTPUT_CONFIG_PATH); let current = fs::read_to_string(dest_path).unwrap_or_default(); if current != merged_toml { @@ -57,3 +101,33 @@ fn main() { .unwrap_or_else(|_| panic!("Failed to write {dest_path:?}")); } } + +/// Recursively walks the JSON objects in `value` and inserts one env var name per non-array leaf into `env_vars`: the prefix followed by the upper-cased key path (`path` plus the leaf key), joined with the separator. +fn collect_env_vars(value: &Value, env_vars: &mut HashSet, path: &[String]) { + if let Value::Object(map) = value { + for (key, val) in map { + let mut new_path = path.to_owned(); + new_path.push(key.to_uppercase()); + + match val { + Value::Null | Value::String(_) | Value::Number(_) | Value::Bool(_) => { + // Leaf node (`null` included: presumably how an unset optional field serializes) - create environment variable + let env_var = format!( + "{}{}{}", + settings::ENVIRONMENT_VARIABLE_PREFIX, + settings::ENVIRONMENT_VARIABLE_SEPARATOR, + new_path.join(settings::ENVIRONMENT_VARIABLE_SEPARATOR) + ); + env_vars.insert(env_var); + } + Value::Object(_) => { + // Recurse into nested objects + collect_env_vars(val, env_vars, &new_path); + } + // Arrays (e.g. `backends`) cannot be overridden per-element via env vars. + // Env overrides replace entire scalar fields; skip array values intentionally. + Value::Array(_) => {} + } + } + } +} diff --git a/crates/trusted-server-core/src/backend_router.rs b/crates/trusted-server-core/src/backend_router.rs new file mode 100644 index 00000000..f862f2f4 --- /dev/null +++ b/crates/trusted-server-core/src/backend_router.rs @@ -0,0 +1,524 @@ +use error_stack::Report; +use regex::Regex; +use std::borrow::Cow; +use std::collections::HashMap; + +use crate::error::TrustedServerError; +use crate::settings::{BackendRoutingConfig, PathPattern}; + +/// Backend routing system that selects the appropriate origin URL based on request host and path. +/// +/// Leverages Trusted Server's dynamic backend creation - we just need to select the right +/// origin URL, and the backend will be created automatically via [`crate::backend::BackendConfig::from_url()`].
+/// +/// Supports: +/// - Domain-based routing (exact match + www normalization) +/// - Path-based routing (optional prefix/regex patterns; if a pattern sets both, only the prefix is checked, and a pattern with neither matches any path) +/// - Fallback to default origin +#[derive(Debug, Clone)] +pub struct BackendRouter { + routes: Vec, + domain_index: HashMap, + default_origin: String, + default_certificate_check: bool, +} + +#[derive(Debug, Clone)] +pub struct BackendRoute { + pub origin_url: String, + pub certificate_check: bool, + path_patterns: Vec, +} + +#[derive(Clone)] +struct CompiledPathPattern { + host: Option, + path_prefix: Option, + path_regex: Option, +} + +impl core::fmt::Debug for CompiledPathPattern { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("CompiledPathPattern") + .field("host", &self.host) + .field("path_prefix", &self.path_prefix) + .field("path_regex", &self.path_regex.as_ref().map(Regex::as_str)) + .finish() + } +} + +impl CompiledPathPattern { + fn new(pattern: &PathPattern) -> Result> { + let path_regex = pattern + .path_regex + .as_deref() + .map(|s| { + Regex::new(s).map_err(|e| { + Report::new(TrustedServerError::Configuration { + message: format!("Invalid path_regex pattern `{s}`: {e}"), + }) + }) + }) + .transpose()?; + + Ok(Self { + host: pattern.host.clone(), + path_prefix: pattern.path_prefix.clone(), + path_regex, + }) + } + + fn matches(&self, host: &str, path: &str) -> bool { + let host_matches = match &self.host { + None => true, + Some(pattern) if pattern == "*" => true, + Some(pattern) => { + let normalized_host = normalize_domain(host); + let normalized_pattern = normalize_domain(pattern); + normalized_host == normalized_pattern + } + }; + + if !host_matches { + return false; + } + + if let Some(prefix) = &self.path_prefix { + return path.starts_with(prefix); + } + + if let Some(ref regex) = self.path_regex { + return regex.is_match(path); + } + + true + } +} + +impl BackendRouter { + /// Creates a new [`BackendRouter`] from backend configurations.
+ /// + /// Backends are stored as origin URL + `certificate_check` pairs. + /// The actual Fastly backend will be created dynamically at request time. + /// + /// # Errors + /// + /// Returns an error if a path regex pattern fails to compile. + pub fn new( + backends: &[BackendRoutingConfig], + default_origin: String, + default_certificate_check: bool, + ) -> Result> { + let mut domain_index = HashMap::new(); + let mut routes = Vec::with_capacity(backends.len()); + + for (idx, backend) in backends.iter().enumerate() { + for domain in &backend.domains { + let normalized = normalize_domain(domain).into_owned(); + if let Some(existing_idx) = domain_index.insert(normalized.clone(), idx) { + log::warn!( + "Backend domain '{}' appears in multiple backends (index {} and {}); using backend {}", + normalized, existing_idx, idx, idx + ); + } + } + + let path_patterns = backend + .path_patterns + .iter() + .map(CompiledPathPattern::new) + .collect::, _>>()?; + + routes.push(BackendRoute { + origin_url: backend.origin_url.clone(), + certificate_check: backend.certificate_check, + path_patterns, + }); + } + + Ok(Self { + routes, + domain_index, + default_origin, + default_certificate_check, + }) + } + + /// Selects the appropriate origin URL and TLS settings based on request host and path. + /// + /// Selection priority: + /// 1. Exact domain match + /// 2. www. prefix normalization (www.example.com → example.com) + /// 3. Path pattern matching (prefix or regex) + /// 4. Fallback to default origin + /// + /// Returns `(origin_url, certificate_check)` tuple. 
+ #[must_use] + pub fn select_origin(&self, host: &str, path: &str) -> (&str, bool) { + let normalized_host = normalize_domain(host); + + // Try domain index first (single hash lookup on the normalized host) + if let Some(&idx) = self.domain_index.get(normalized_host.as_ref()) { + let route = &self.routes[idx]; + return (&route.origin_url, route.certificate_check); + } + + // Try path patterns in configuration order; the first matching pattern wins + for route in &self.routes { + for pattern in &route.path_patterns { + if pattern.matches(host, path) { + return (&route.origin_url, route.certificate_check); + } + } + } + + // Fallback to default + (&self.default_origin, self.default_certificate_check) + } +} + +/// Normalizes a domain by stripping every leading "www." label and converting to lowercase. +/// +/// Returns a [`Cow::Borrowed`] sub-slice whenever the input has no ASCII uppercase byte (even if "www." is +/// stripped); only the lowercasing path allocates. NOTE(review): non-ASCII uppercase is lowercased only on that allocating path. +/// +/// # Examples +/// +/// ``` +/// use trusted_server_core::backend_router::normalize_domain; +/// +/// assert_eq!(normalize_domain("WWW.EXAMPLE.COM"), "example.com"); +/// assert_eq!(normalize_domain("www.example.com"), "example.com"); +/// assert_eq!(normalize_domain("example.com"), "example.com"); +/// assert_eq!(normalize_domain("sub.example.com"), "sub.example.com"); +/// ``` +#[must_use] +pub fn normalize_domain(domain: &str) -> Cow<'_, str> { + if domain.bytes().any(|b| b.is_ascii_uppercase()) { + let mut lower = domain.to_lowercase(); + while lower.starts_with("www.") { + lower.drain(..4); + } + Cow::Owned(lower) + } else { + Cow::Borrowed(domain.trim_start_matches("www.")) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn create_test_backend(id: &str, domains: Vec<&str>, origin_url: &str) -> BackendRoutingConfig { + BackendRoutingConfig { + id: Some(id.to_string()), + origin_url: origin_url.to_string(), + domains: domains.into_iter().map(String::from).collect(), + path_patterns: vec![], + certificate_check: true, + } + } + + fn create_test_backend_with_patterns( + id: &str,
+ origin_url: &str, + patterns: Vec, + ) -> BackendRoutingConfig { + BackendRoutingConfig { + id: Some(id.to_string()), + origin_url: origin_url.to_string(), + domains: vec![], + path_patterns: patterns, + certificate_check: true, + } + } + + #[test] + fn test_exact_domain_match() { + let backends = vec![create_test_backend( + "backend-a", + vec!["site-a.example.com", "site-b.example.com"], + "https://backend-a.example.com", + )]; + + let router = BackendRouter::new(&backends, "https://default-origin.com".to_string(), true) + .expect("should build router from valid backends config"); + + let (origin, cert_check) = router.select_origin("site-a.example.com", "/"); + assert_eq!(origin, "https://backend-a.example.com"); + assert!(cert_check); + + let (origin, _) = router.select_origin("site-b.example.com", "/article"); + assert_eq!(origin, "https://backend-a.example.com"); + } + + #[test] + fn test_www_prefix_normalization() { + let backends = vec![create_test_backend( + "backend-a", + vec!["site-a.example.com"], + "https://backend-a.example.com", + )]; + + let router = BackendRouter::new(&backends, "https://default-origin.com".to_string(), true) + .expect("should build router from valid backends config"); + + let (origin, _) = router.select_origin("www.site-a.example.com", "/"); + assert_eq!(origin, "https://backend-a.example.com"); + + let (origin, _) = router.select_origin("WWW.SITE-A.EXAMPLE.COM", "/"); + assert_eq!(origin, "https://backend-a.example.com"); + } + + #[test] + fn test_subdomain_no_match() { + let backends = vec![create_test_backend( + "backend-a", + vec!["site-a.example.com"], + "https://backend-a.example.com", + )]; + + let router = BackendRouter::new(&backends, "https://default-origin.com".to_string(), true) + .expect("should build router from valid backends config"); + + let (origin, _) = router.select_origin("trending.site-a.example.com", "/"); + assert_eq!( + origin, "https://default-origin.com", + "trending.site-a.example.com should fall back 
to default" + ); + } + + #[test] + fn test_path_prefix_matching() { + let backends = vec![create_test_backend_with_patterns( + "backend-b", + "https://backend-b.example.com", + vec![ + PathPattern { + host: Some("site-c.example.com".to_string()), + path_prefix: Some("/.api/".to_string()), + path_regex: None, + }, + PathPattern { + host: Some("site-c.example.com".to_string()), + path_prefix: Some("/my-account".to_string()), + path_regex: None, + }, + ], + )]; + + let router = BackendRouter::new(&backends, "https://default-origin.com".to_string(), true) + .expect("should build router from valid backends config"); + + let (origin, _) = router.select_origin("site-c.example.com", "/.api/users"); + assert_eq!(origin, "https://backend-b.example.com"); + + let (origin, _) = router.select_origin("site-c.example.com", "/my-account/settings"); + assert_eq!(origin, "https://backend-b.example.com"); + + let (origin, _) = router.select_origin("site-c.example.com", "/articles"); + assert_eq!(origin, "https://default-origin.com"); + } + + #[test] + fn test_path_regex_matching() { + let backends = vec![create_test_backend_with_patterns( + "backend-c", + "https://backend-c.example.com", + vec![PathPattern { + host: Some("*".to_string()), + path_prefix: None, + path_regex: Some("^/image/upload/".to_string()), + }], + )]; + + let router = BackendRouter::new(&backends, "https://default-origin.com".to_string(), true) + .expect("should build router from valid backends config"); + + let (origin, _) = + router.select_origin("site-a.example.com", "/image/upload/v1234/photo.jpg"); + assert_eq!(origin, "https://backend-c.example.com"); + + let (origin, _) = router.select_origin("site-a.example.com", "/images/photo.jpg"); + assert_eq!(origin, "https://default-origin.com"); + } + + #[test] + fn test_wildcard_host_pattern() { + let backends = vec![create_test_backend_with_patterns( + "s3", + "http://s3.amazonaws.com", + vec![PathPattern { + host: None, + path_prefix: 
Some("/bucket/".to_string()), + path_regex: None, + }], + )]; + + let router = BackendRouter::new(&backends, "https://default-origin.com".to_string(), true) + .expect("should build router from valid backends config"); + + let (origin, _) = router.select_origin("anydomain.com", "/bucket/file.txt"); + assert_eq!(origin, "http://s3.amazonaws.com"); + + let (origin, _) = router.select_origin("another.com", "/bucket/"); + assert_eq!(origin, "http://s3.amazonaws.com"); + } + + #[test] + fn test_fallback_to_default() { + let backends = vec![create_test_backend( + "backend-a", + vec!["site-a.example.com"], + "https://backend-a.example.com", + )]; + + let router = BackendRouter::new(&backends, "https://default-origin.com".to_string(), true) + .expect("should build router from valid backends config"); + + let (origin, _) = router.select_origin("unknown.com", "/"); + assert_eq!(origin, "https://default-origin.com"); + } + + #[test] + fn test_multiple_backends_priority() { + let backends = vec![ + create_test_backend( + "backend-a", + vec!["site-a.example.com"], + "https://backend-a.example.com", + ), + create_test_backend( + "backend-b", + vec!["site-c.example.com"], + "https://backend-b.example.com", + ), + ]; + + let router = BackendRouter::new(&backends, "https://default-origin.com".to_string(), true) + .expect("should build router from valid backends config"); + + let (origin, _) = router.select_origin("site-a.example.com", "/"); + assert_eq!(origin, "https://backend-a.example.com"); + + let (origin, _) = router.select_origin("site-c.example.com", "/"); + assert_eq!(origin, "https://backend-b.example.com"); + } + + #[test] + fn test_normalize_domain() { + assert_eq!(normalize_domain("example.com"), "example.com"); + assert_eq!(normalize_domain("www.example.com"), "example.com"); + assert_eq!(normalize_domain("WWW.EXAMPLE.COM"), "example.com"); + assert_eq!(normalize_domain("Www.Example.Com"), "example.com"); + assert_eq!(normalize_domain("sub.example.com"), 
"sub.example.com"); + assert_eq!( + normalize_domain("www.sub.example.com"), + "sub.example.com", + "should only strip leading www" + ); + } + + #[test] + fn test_certificate_check_setting() { + let backends = vec![BackendRoutingConfig { + id: Some("custom".to_string()), + origin_url: "https://custom-origin.com".to_string(), + domains: vec!["custom.com".to_string()], + path_patterns: vec![], + certificate_check: false, + }]; + + let router = BackendRouter::new(&backends, "https://default-origin.com".to_string(), true) + .expect("should build router from valid backends config"); + + let (origin, cert_check) = router.select_origin("custom.com", "/"); + assert_eq!(origin, "https://custom-origin.com"); + assert!( + !cert_check, + "should respect backend-specific certificate_check" + ); + } + + #[test] + fn rejects_invalid_path_regex() { + let backends = vec![BackendRoutingConfig { + id: None, + origin_url: "https://example.com".to_string(), + domains: vec![], + path_patterns: vec![PathPattern { + host: None, + path_prefix: None, + path_regex: Some("[invalid".to_string()), + }], + certificate_check: true, + }]; + + let _err = BackendRouter::new(&backends, "https://default.com".to_string(), true) + .expect_err("should reject invalid path_regex pattern"); + } + + #[test] + fn duplicate_domain_uses_last_backend() { + let backends = vec![ + BackendRoutingConfig { + id: None, + origin_url: "https://first.com".to_string(), + domains: vec!["example.com".to_string()], + path_patterns: vec![], + certificate_check: true, + }, + BackendRoutingConfig { + id: None, + origin_url: "https://second.com".to_string(), + domains: vec!["example.com".to_string()], + path_patterns: vec![], + certificate_check: true, + }, + ]; + + let router = BackendRouter::new(&backends, "https://default.com".to_string(), true) + .expect("should succeed even with duplicate domains"); + + let (url, _) = router.select_origin("example.com", "/"); + assert_eq!( + url, "https://second.com", + "should route to 
last backend when domain appears twice" + ); + } + + #[test] + fn test_domain_and_path_pattern_precedence() { + let backends = vec![ + create_test_backend( + "backend-a", + vec!["site-c.example.com"], + "https://backend-a.example.com", + ), + create_test_backend_with_patterns( + "backend-b", + "https://backend-b.example.com", + vec![PathPattern { + host: Some("site-c.example.com".to_string()), + path_prefix: Some("/.api/".to_string()), + path_regex: None, + }], + ), + ]; + + let router = BackendRouter::new(&backends, "https://default-origin.com".to_string(), true) + .expect("should build router from valid backends config"); + + let (origin, _) = router.select_origin("site-c.example.com", "/"); + assert_eq!( + origin, "https://backend-a.example.com", + "domain match should take precedence over path pattern" + ); + + let (origin, _) = router.select_origin("site-c.example.com", "/.api/users"); + assert_eq!( + origin, "https://backend-a.example.com", + "domain match should still take precedence for API paths" + ); + } +} diff --git a/crates/trusted-server-core/src/integrations/datadome.rs b/crates/trusted-server-core/src/integrations/datadome.rs index e0f6e7c2..b237bed4 100644 --- a/crates/trusted-server-core/src/integrations/datadome.rs +++ b/crates/trusted-server-core/src/integrations/datadome.rs @@ -56,14 +56,15 @@ //! 
- Handles both `src` and `href` attributes (for preload/prefetch links) use std::sync::Arc; +use std::time::Duration; use async_trait::async_trait; use error_stack::{Report, ResultExt}; use fastly::http::{header, Method, StatusCode}; use fastly::{Request, Response}; -use once_cell::sync::Lazy; use regex::Regex; use serde::Deserialize; +use std::sync::LazyLock; use validator::Validate; use crate::backend::BackendConfig; @@ -93,7 +94,7 @@ const DATADOME_INTEGRATION_ID: &str = "datadome"; /// - `'//js.datadome.co/js/check'` /// - `"api-js.datadome.co/js/check"` /// - `"js.datadome.co"` -static DATADOME_URL_PATTERN: Lazy = Lazy::new(|| { +static DATADOME_URL_PATTERN: LazyLock = LazyLock::new(|| { Regex::new(r#"(['"])(https?:)?(//)?(api-)?js\.datadome\.co(/[^'"]*)?(['"])"#) .expect("DataDome URL rewrite regex should compile") }); @@ -125,6 +126,38 @@ pub struct DataDomeConfig { /// Whether to rewrite `DataDome` script URLs in HTML to first-party paths #[serde(default = "default_rewrite_sdk")] pub rewrite_sdk: bool, + + // Server-side validation configuration + /// Enable server-side bot validation (default: false) + #[serde(default = "default_server_side_enabled")] + pub server_side_enabled: bool, + + /// `DataDome` API key for server-side validation + /// Can be set via environment variable: `TRUSTED_SERVER__INTEGRATIONS__DATADOME__SERVER_SIDE_KEY` + #[serde(default)] + pub server_side_key: Option, + + /// Validation endpoint (default: `https://api-fastly.datadome.co`) + #[serde(default = "default_validation_endpoint")] + #[validate(url)] + pub validation_endpoint: String, + + /// Validation request timeout in milliseconds (default: 200ms, accepted range 50-1000) + /// Lower timeout = faster fail-open on `DataDome` API issues + #[serde(default = "default_validation_timeout_ms")] + #[validate(range(min = 50, max = 1000))] + pub validation_timeout_ms: u64, + + /// Fail-open behavior when validation fails/times out (default: true = allow request) + /// Set to false only in production with high confidence in `DataDome` uptime
#[serde(default = "default_fail_open")] + pub fail_open: bool, + + /// Percentage of requests to validate (0-100, default: 100) + /// Used for gradual rollout and A/B testing + #[serde(default = "default_sample_rate")] + #[validate(range(min = 0, max = 100))] + pub sample_rate: u8, } fn default_enabled() -> bool { @@ -147,6 +180,26 @@ fn default_rewrite_sdk() -> bool { true } +fn default_server_side_enabled() -> bool { + false +} + +fn default_validation_endpoint() -> String { + "https://api-fastly.datadome.co".to_string() +} + +fn default_validation_timeout_ms() -> u64 { + 200 +} + +fn default_fail_open() -> bool { + true +} + +fn default_sample_rate() -> u8 { + 100 +} + impl Default for DataDomeConfig { fn default() -> Self { Self { @@ -155,6 +208,12 @@ impl Default for DataDomeConfig { api_origin: default_api_origin(), cache_ttl_seconds: default_cache_ttl(), rewrite_sdk: default_rewrite_sdk(), + server_side_enabled: default_server_side_enabled(), + server_side_key: None, + validation_endpoint: default_validation_endpoint(), + validation_timeout_ms: default_validation_timeout_ms(), + fail_open: default_fail_open(), + sample_rate: default_sample_rate(), } } } @@ -245,7 +304,8 @@ impl DataDomeIntegration { .trim_start_matches("http://") .split('/') .next() - .unwrap_or("api-js.datadome.co") + // split() on &str always yields at least one element, so this is unreachable + .unwrap_or(url) } /// Handle the /tags.js endpoint - fetch and rewrite the `DataDome` SDK. @@ -381,6 +441,196 @@ impl DataDomeIntegration { }) .unwrap_or("/tags.js") } + + /// Extract client IP address from request. + /// + /// Returns `"0.0.0.0"` if the client IP is unavailable (e.g., in test environments or when the platform does not expose the client address). + fn get_client_ip(req: &Request) -> String { + req.get_client_ip_addr() + .map(|ip| ip.to_string()) + .unwrap_or_else(|| "0.0.0.0".to_string()) + } + + /// Perform server-side bot validation using `DataDome` HTTP API. 
+ /// + /// This validates requests at the edge before they reach the origin, providing + /// protection against bots and malicious traffic without VCL restarts. + /// + /// # Validation Flow + /// + /// 1. Sample check: Only validate `sample_rate` % of requests (for gradual rollout) + /// 2. Build validation request with headers: + /// - `x-datadome-params:key` (API key) + /// - `x-datadome-params:ip` (client IP) + /// - `x-datadome-params:clientid` (datadome cookie) + /// - `x-datadome-params:method` (HTTP method) + /// - Other request metadata + /// 3. Send to the `DataDome` API with a low timeout (configured via `validation_timeout_ms`, default 200ms) so that fail-open stays fast + /// 4. Parse response: + /// - 200 OK → Allow request + /// - 403 Forbidden → Block request (bot detected) + /// - Timeout/Error → Fail-open (allow) if configured + /// + /// # Errors + /// + /// Returns `Ok(true)` to allow the request or `Ok(false)` to block it. + /// In fail-open mode (default), errors result in `Ok(true)`; with `fail_open = false`, a failed validation request returns `Err`. + pub fn validate_request(&self, req: &Request) -> Result> { + let client_ip = Self::get_client_ip(req); + self.validate_request_inner(&client_ip, req) + } + + /// Compute a deterministic polynomial hash of a client IP string. + /// + /// Used for consistent sampling: the same IP always maps to the same bucket, + /// so sampled/unsampled status is stable across requests from the same client. + pub(crate) fn hash_client_ip(ip: &str) -> u32 { + ip.bytes().fold(0u32, |acc, b| { + acc.wrapping_mul(31).wrapping_add(u32::from(b)) + }) + } + + /// Extract the `datadome` cookie value from a `Cookie` header string. + /// + /// Returns `None` if the cookie is not present.
+ pub(crate) fn extract_datadome_cookie(cookie_header: &str) -> Option<&str> { + cookie_header.split(';').find_map(|cookie| { + let trimmed = cookie.trim(); + trimmed.strip_prefix("datadome=") + }) + } + + fn validate_request_inner( + &self, + client_ip: &str, + req: &Request, + ) -> Result> { + // Check if server-side validation is enabled + if !self.config.server_side_enabled { + return Ok(true); + } + + // Check if API key is configured + let api_key = match &self.config.server_side_key { + Some(key) if !key.is_empty() => key, + _ => { + log::warn!("[datadome] Server-side validation enabled but no API key configured"); + return Ok(self.config.fail_open); + } + }; + + // Sample rate check: only validate sample_rate% of requests + if self.config.sample_rate < 100 { + // Use client IP hash for deterministic sampling + let hash = Self::hash_client_ip(client_ip); + let sample = (hash % 100) as u8; + + if sample >= self.config.sample_rate { + log::debug!( + "[datadome] Request not sampled (sample={}, rate={})", + sample, + self.config.sample_rate + ); + return Ok(true); + } + } + + // Extract DataDome cookie (clientid) + let datadome_cookie = req + .get_header(header::COOKIE) + .and_then(|h| h.to_str().ok()) + .and_then(Self::extract_datadome_cookie) + .unwrap_or(""); + + // Build validation request + let validation_url = format!("{}/validate-request", self.config.validation_endpoint); + + log::info!( + "[datadome] Server-side validation: method={}, ip={}, cookie_present={}", + req.get_method(), + client_ip, + !datadome_cookie.is_empty() + ); + + let timeout = Duration::from_millis(self.config.validation_timeout_ms); + let backend = + BackendConfig::from_url_with_first_byte_timeout(&validation_url, true, timeout) + .change_context(Self::error("Invalid validation endpoint URL"))?; + + let mut validation_req = Request::new(Method::POST, &validation_url); + + // Set DataDome validation headers + validation_req.set_header("x-datadome-params:key", api_key); + 
validation_req.set_header("x-datadome-params:requestmodulename", "TrustedServerRust"); + validation_req.set_header("x-datadome-params:moduleversion", "1.0"); + validation_req.set_header("x-datadome-params:ip", client_ip); + validation_req.set_header("x-datadome-params:method", req.get_method().as_str()); + + if !datadome_cookie.is_empty() { + validation_req.set_header( + "x-datadome-params:clientid", + urlencoding::encode(datadome_cookie).as_ref(), + ); + } + + // Copy relevant headers for fingerprinting + if let Some(ua) = req.get_header(header::USER_AGENT) { + validation_req.set_header(header::USER_AGENT, ua); + } + if let Some(accept) = req.get_header(header::ACCEPT) { + validation_req.set_header(header::ACCEPT, accept); + } + if let Some(accept_lang) = req.get_header(header::ACCEPT_LANGUAGE) { + validation_req.set_header(header::ACCEPT_LANGUAGE, accept_lang); + } + + // Use configured timeout to prevent stalls under DataDome API failures + + // Send validation request + let validation_result = validation_req.send(&backend); + + match validation_result { + Ok(resp) => { + let status = resp.get_status(); + log::info!("[datadome] Validation response: {}", status); + + if status == StatusCode::OK { + // Request validated - allow through + Ok(true) + } else if status == StatusCode::FORBIDDEN { + // Bot detected - block request + log::warn!("[datadome] Request blocked by DataDome (403)"); + Ok(false) + } else { + // Unexpected status - apply fail-open policy + log::warn!( + "[datadome] Unexpected validation response: {} (fail_open={})", + status, + self.config.fail_open + ); + Ok(self.config.fail_open) + } + } + Err(e) => { + // Request failed (timeout, network error, etc.) 
+ log::warn!( + "[datadome] Validation request failed: {:?} (fail_open={})", + e, + self.config.fail_open + ); + + if self.config.fail_open { + // Fail-open: allow request despite validation failure + Ok(true) + } else { + // Fail-closed: block request on validation errors + Err(Report::new(Self::error( + "Validation failed and fail_open=false", + ))) + } + } + } + } } #[async_trait(?Send)] @@ -465,6 +715,16 @@ fn build(settings: &Settings) -> Option> { let config = match settings.integration_config::(DATADOME_INTEGRATION_ID) { Ok(Some(config)) => config, Ok(None) => { + // IntegrationSettings derefs to HashMap, so we can inspect + // the raw config even when enabled=false to detect the common misconfiguration + // of setting server_side_enabled=true without setting enabled=true. + if let Some(raw) = settings.integrations.get(DATADOME_INTEGRATION_ID) { + if let Ok(raw_config) = serde_json::from_value::(raw.clone()) { + if raw_config.server_side_enabled { + log::warn!("[datadome] server_side_enabled = true has no effect when enabled = false; server-side validation is inactive"); + } + } + } log::debug!("[datadome] Integration disabled or not configured"); return None; } @@ -475,14 +735,64 @@ fn build(settings: &Settings) -> Option> { }; log::info!( - "[datadome] Registering integration (sdk_origin: {}, rewrite_sdk: {})", + "[datadome] Registering integration (sdk_origin: {}, rewrite_sdk: {}, server_side: {})", config.sdk_origin, - config.rewrite_sdk + config.rewrite_sdk, + config.server_side_enabled ); Some(DataDomeIntegration::new(config)) } +/// Perform server-side `DataDome` validation on a request. +/// +/// This is the public API for validating requests before they reach the origin. +/// Call this from request handlers to check for bots/malicious traffic. 
+/// +/// # Returns +/// +/// - `Ok(true)` - Request should be allowed through +/// - `Ok(false)` - Request should be blocked (bot detected) +/// - `Err(...)` - Validation error with `fail_open=false` +/// +/// # Errors +/// +/// Only returns an error if validation fails AND `fail_open=false`. +/// With default `fail_open=true`, errors result in `Ok(true)`. +/// +/// # Examples +/// +/// ```no_run +/// use trusted_server_core::integrations::datadome; +/// use trusted_server_core::settings::Settings; +/// use fastly::Request; +/// +/// fn handle_request(settings: &Settings, req: &Request) { +/// match datadome::validate_request_server_side(settings, req) { +/// Ok(true) => { +/// // Allow request - proceed to origin +/// } +/// Ok(false) => { +/// // Block request - bot detected +/// // Return 403 or captcha page +/// } +/// Err(e) => { +/// // Validation error with fail_open=false +/// // Handle as appropriate +/// } +/// } +/// } +/// ``` +pub fn validate_request_server_side( + settings: &Settings, + req: &Request, +) -> Result> { + match build(settings) { + Some(integration) => integration.validate_request(req), + None => Ok(true), + } +} + /// Register the `DataDome` integration with Trusted Server. 
#[must_use] pub fn register(settings: &Settings) -> Option { @@ -498,6 +808,9 @@ pub fn register(settings: &Settings) -> Option { #[cfg(test)] mod tests { + use fastly::http::Method; + use fastly::Request; + use super::*; fn test_config() -> DataDomeConfig { @@ -507,6 +820,12 @@ mod tests { api_origin: "https://api-js.datadome.co".to_string(), cache_ttl_seconds: 3600, rewrite_sdk: true, + server_side_enabled: false, + server_side_key: None, + validation_endpoint: "https://api-fastly.datadome.co".to_string(), + validation_timeout_ms: 200, + fail_open: true, + sample_rate: 100, } } @@ -815,4 +1134,129 @@ mod tests { _ => panic!("Expected Replace action for bare domain"), } } + + fn server_side_config(overrides: impl FnOnce(&mut DataDomeConfig)) -> DataDomeConfig { + let mut cfg = test_config(); + cfg.server_side_enabled = true; + cfg.server_side_key = Some("test-api-key".to_string()); + overrides(&mut cfg); + cfg + } + + #[test] + fn validation_disabled_fast_path() { + let cfg = test_config(); // server_side_enabled: false + let integration = DataDomeIntegration::new(cfg); + let req = Request::new(Method::GET, "https://example.com/page"); + let result = integration + .validate_request_inner("1.2.3.4", &req) + .expect("should return Ok when validation is disabled"); + assert!( + result, + "should allow request when server-side validation is disabled" + ); + } + + #[test] + fn missing_key_fail_open() { + let cfg = server_side_config(|c| { + c.server_side_key = None; + c.fail_open = true; + }); + let integration = DataDomeIntegration::new(cfg); + let req = Request::new(Method::GET, "https://example.com/page"); + let result = integration + .validate_request_inner("1.2.3.4", &req) + .expect("should return Ok(true) when fail_open=true and key is missing"); + assert!( + result, + "should allow request when key is missing and fail_open=true" + ); + } + + #[test] + fn missing_key_fail_closed() { + // When the key is missing (misconfiguration), the fail_open flag controls + // 
whether to block (false) or allow (true) — but always as Ok, not Err. + // Err is only returned on network failures with fail_open=false. + let cfg = server_side_config(|c| { + c.server_side_key = None; + c.fail_open = false; + }); + let integration = DataDomeIntegration::new(cfg); + let req = Request::new(Method::GET, "https://example.com/page"); + let result = integration + .validate_request_inner("1.2.3.4", &req) + .expect("should return Ok when key is missing"); + assert!( + !result, + "should block request when key is missing and fail_open=false" + ); + } + + #[test] + fn sample_rate_zero_skips_all_requests() { + let cfg = server_side_config(|c| c.sample_rate = 0); + let integration = DataDomeIntegration::new(cfg); + let req = Request::new(Method::GET, "https://example.com/page"); + let result = integration + .validate_request_inner("1.2.3.4", &req) + .expect("should return Ok when sample_rate=0"); + assert!( + result, + "should allow all requests when sample_rate=0 (nothing sampled)" + ); + } + + #[test] + fn sample_rate_full_does_not_skip() { + // sample_rate=100 means all requests go to validation + // With a key but no real API, this will try to make an HTTP call (which will fail in test env) + // We only check that we DON'T early-return due to sampling + // This is tested indirectly: with sample_rate=100, missing key path returns fail_open + let mut cfg = test_config(); + cfg.server_side_enabled = true; + cfg.server_side_key = None; // will hit the missing-key guard + cfg.sample_rate = 100; + cfg.fail_open = true; + let integration = DataDomeIntegration::new(cfg); + let req = Request::new(Method::GET, "https://example.com/page"); + // With rate=100, no early-return from sampling; reaches key check; key missing; fail_open=true → Ok(true) + let result = integration + .validate_request_inner("1.2.3.4", &req) + .expect("should return Ok"); + assert!( + result, + "should allow request (no key, fail_open=true, sample_rate=100)" + ); + } + + #[test] + fn 
cookie_extracted_from_multi_cookie_header() { + let header = "session=abc; datadome=token123; other=xyz"; + let extracted = DataDomeIntegration::extract_datadome_cookie(header); + assert_eq!( + extracted, + Some("token123"), + "should extract datadome cookie value from multi-cookie header" + ); + } + + #[test] + fn deterministic_sampling_hash() { + let ip = "192.168.1.100"; + let hash1 = DataDomeIntegration::hash_client_ip(ip); + let hash2 = DataDomeIntegration::hash_client_ip(ip); + assert_eq!( + hash1, hash2, + "should produce identical hash for same IP across calls" + ); + + let ip2 = "192.168.100.1"; + let hash3 = DataDomeIntegration::hash_client_ip(ip2); + assert_ne!( + hash1, hash3, + "should produce different hash for different IP" + ); + } } diff --git a/crates/trusted-server-core/src/lib.rs b/crates/trusted-server-core/src/lib.rs index 1fd95369..7f2bed7d 100644 --- a/crates/trusted-server-core/src/lib.rs +++ b/crates/trusted-server-core/src/lib.rs @@ -36,6 +36,7 @@ pub mod auction; pub mod auction_config_types; pub mod auth; pub mod backend; +pub mod backend_router; pub mod consent; pub mod consent_config; pub mod constants; diff --git a/crates/trusted-server-core/src/publisher.rs b/crates/trusted-server-core/src/publisher.rs index cec56b25..53fdd666 100644 --- a/crates/trusted-server-core/src/publisher.rs +++ b/crates/trusted-server-core/src/publisher.rs @@ -3,6 +3,7 @@ use fastly::http::{header, StatusCode}; use fastly::{Body, Request, Response}; use crate::backend::BackendConfig; +use crate::backend_router::BackendRouter; use crate::consent::{allows_ssc_creation, build_consent_context, ConsentPipelineInput}; use crate::constants::{COOKIE_SYNTHETIC_ID, HEADER_X_COMPRESS_HINT, HEADER_X_SYNTHETIC_ID}; use crate::cookies::{expire_synthetic_cookie, handle_request_cookies, set_synthetic_cookie}; @@ -271,17 +272,82 @@ pub fn handle_publisher_request( ssc_allowed, ); - let backend_name = BackendConfig::from_url( - &settings.publisher.origin_url, - 
settings.proxy.certificate_check, - )?; - let origin_host = settings.publisher.origin_host(); + let request_path = req.get_path(); + + let router = if settings.backends.is_empty() { + None + } else { + BackendRouter::new( + &settings.backends, + settings.publisher.origin_url.clone(), + settings.proxy.certificate_check, + ) + .map_err(|e| { + log::error!("Failed to build backend router: {:?}", e); + e + }) + .ok() + }; + + let (origin_url, certificate_check) = if let Some(ref router) = router { + let (url, cert_check) = router.select_origin(request_host, request_path); + log::info!( + "Backend routing: host={}, path={} → {}", + request_host, + request_path, + url + ); + (url, cert_check) + } else { + ( + settings.publisher.origin_url.as_str(), + settings.proxy.certificate_check, + ) + }; + + let backend_name = BackendConfig::from_url(origin_url, certificate_check)?; + + let origin_host = url::Url::parse(origin_url) + .ok() + .and_then(|url| { + url.host_str().map(|host| match url.port() { + Some(port) => format!("{}:{}", host, port), + None => host.to_string(), + }) + }) + .unwrap_or_else(|| origin_url.to_string()); log::debug!( "Proxying to dynamic backend: {} (from {})", backend_name, - settings.publisher.origin_url + origin_url ); + + // DataDome server-side validation (if enabled) + // This validates requests at the edge before they reach the origin + match crate::integrations::datadome::validate_request_server_side(settings, &req) { + Ok(true) => { + // Request allowed - continue to origin + log::debug!("[datadome] Request validated - proceeding to origin"); + } + Ok(false) => { + // Request blocked - bot detected + log::warn!("[datadome] Request blocked by server-side validation"); + let mut blocked_response = Response::from_status(StatusCode::FORBIDDEN); + blocked_response.set_body("Forbidden"); + blocked_response.set_header(header::CONTENT_TYPE, "text/plain"); + return Ok(blocked_response); + } + Err(e) => { + // Validation error with fail_open=false + 
log::error!("[datadome] Validation error: {:?}", e); + let mut error_response = Response::from_status(StatusCode::SERVICE_UNAVAILABLE); + error_response.set_body("Service Temporarily Unavailable"); + error_response.set_header(header::CONTENT_TYPE, "text/plain"); + return Ok(error_response); + } + } + req.set_header("host", &origin_host); let mut response = req @@ -328,7 +394,7 @@ pub fn handle_publisher_request( let params = ProcessResponseParams { content_encoding: &content_encoding, origin_host: &origin_host, - origin_url: &settings.publisher.origin_url, + origin_url, request_host, request_scheme, settings, diff --git a/crates/trusted-server-core/src/settings.rs b/crates/trusted-server-core/src/settings.rs index 0b1dda53..6116277f 100644 --- a/crates/trusted-server-core/src/settings.rs +++ b/crates/trusted-server-core/src/settings.rs @@ -327,6 +327,38 @@ impl Default for Proxy { } } +#[derive(Debug, Clone, Deserialize, Serialize, Validate)] +#[validate(schema(function = validate_path_pattern))] +pub struct PathPattern { + /// Optional host pattern. If None, matches all hosts (wildcard). + pub host: Option, + /// Optional path prefix to match (e.g., "/.api/"). + pub path_prefix: Option, + /// Optional path regex pattern to match (e.g., "^/image/upload/"). + pub path_regex: Option, +} + +#[derive(Debug, Clone, Deserialize, Serialize, Validate)] +pub struct BackendRoutingConfig { + /// Origin URL for this backend (e.g., `https://origin.example.com`). + /// The actual Fastly backend will be created dynamically at request time. + #[validate(url)] + pub origin_url: String, + /// List of domains that should route to this backend. + #[serde(default)] + pub domains: Vec, + /// Optional path-based routing patterns. + #[serde(default)] + #[validate(nested)] + pub path_patterns: Vec, + /// Enable TLS certificate verification for this backend. + #[serde(default = "default_certificate_check")] + pub certificate_check: bool, + /// Unique identifier for logging/debugging (optional).
+ #[serde(default)] + pub id: Option, +} + #[derive(Debug, Default, Clone, Deserialize, Serialize, Validate)] pub struct Settings { #[validate(nested)] @@ -351,6 +383,11 @@ pub struct Settings { pub consent: ConsentConfig, #[serde(default)] pub proxy: Proxy, + /// Optional multi-backend routing configuration. + /// Use `BackendRouter::new()` to create a router from this config. + #[serde(default)] + #[validate(nested)] + pub backends: Vec, } #[allow(unused)] @@ -512,6 +549,16 @@ impl Settings { } } +fn validate_path_pattern(pattern: &PathPattern) -> Result<(), ValidationError> { + if pattern.path_prefix.is_some() && pattern.path_regex.is_some() { + let mut err = ValidationError::new("conflicting_path_pattern"); + err.message = + Some("path_prefix and path_regex are mutually exclusive; set only one".into()); + return Err(err); + } + Ok(()) +} + fn validate_no_trailing_slash(value: &str) -> Result<(), ValidationError> { if value.ends_with('/') { let mut err = ValidationError::new("trailing_slash"); @@ -1529,4 +1576,76 @@ mod tests { ); } } + + #[test] + fn backend_routing_config_accepts_valid_origin_url() { + let config = BackendRoutingConfig { + origin_url: "https://raven-public.prod.saymedia.com".to_string(), + domains: vec![], + path_patterns: vec![], + certificate_check: true, + id: None, + }; + config + .validate() + .expect("should accept a valid HTTPS origin URL"); + } + + #[test] + fn backend_routing_config_rejects_empty_origin_url() { + let config = BackendRoutingConfig { + origin_url: "".to_string(), + domains: vec![], + path_patterns: vec![], + certificate_check: true, + id: None, + }; + config + .validate() + .expect_err("should reject empty origin_url"); + } + + #[test] + fn backend_routing_config_rejects_bare_hostname() { + let config = BackendRoutingConfig { + origin_url: "raven-public.prod.saymedia.com".to_string(), + domains: vec![], + path_patterns: vec![], + certificate_check: true, + id: None, + }; + config + .validate() + .expect_err("should 
reject bare hostname without scheme"); + } + + #[test] + fn backend_routing_config_rejects_non_url_string() { + let config = BackendRoutingConfig { + origin_url: "not-a-url".to_string(), + domains: vec![], + path_patterns: vec![], + certificate_check: true, + id: None, + }; + config + .validate() + .expect_err("should reject non-URL string as origin_url"); + } + + #[test] + fn backend_routing_config_accepts_http_origin_url() { + // HTTP origins are valid — internal backends may not use TLS. + // The validate(url) rule accepts any valid URL scheme. + let config = BackendRoutingConfig { + origin_url: "http://internal-service.example.com".to_string(), + domains: vec![], + path_patterns: vec![], + certificate_check: true, + id: None, + }; + config + .validate() + .expect("should accept a valid HTTP origin URL"); + } } diff --git a/docs/.vitepress/config.mts b/docs/.vitepress/config.mts index 6b7e9396..1b113370 100644 --- a/docs/.vitepress/config.mts +++ b/docs/.vitepress/config.mts @@ -117,6 +117,10 @@ export default withMermaid( link: '/guide/proxy-signing', }, { text: 'Collective Sync', link: '/guide/collective-sync' }, + { + text: 'Multi-Backend Routing', + link: '/guide/multi-backend-routing', + }, ], }, { diff --git a/docs/guide/integrations/datadome.md b/docs/guide/integrations/datadome.md index 8d063a62..bc4358c6 100644 --- a/docs/guide/integrations/datadome.md +++ b/docs/guide/integrations/datadome.md @@ -161,6 +161,91 @@ curl -X POST https://your-domain.com/integrations/datadome/js/check Ensure `rewrite_sdk = true` and that your pages are being proxied through Trusted Server's HTML processing pipeline. +## Server-Side Validation + +In addition to first-party JS delivery, Trusted Server can call the DataDome +server-side API to validate each request at the edge before forwarding it to +your origin. Bots are blocked with a `403 Forbidden` response without ever +reaching your backend. 
+ +### How It Works + +```mermaid +sequenceDiagram + participant Browser + participant TS as Trusted Server + participant DD as api-fastly.datadome.co + participant Origin + + Browser->>TS: GET /article + TS->>DD: POST /validate-request\n(IP, path, headers, cookie) + DD-->>TS: 200 OK (allow) or 403 (block) + alt allowed + TS->>Origin: Proxy request + Origin-->>TS: Response + TS-->>Browser: Response + else blocked + TS-->>Browser: 403 Forbidden + end +``` + +Validation runs before the request reaches your origin. The DataDome cookie +(`datadome=`) is forwarded when present so DataDome can maintain session +continuity for users it has already classified. + +### Configuration + +```toml +[integrations.datadome] +enabled = true +server_side_enabled = true +server_side_key = "your-datadome-server-side-key" +``` + +Set `server_side_key` via environment variable to keep it out of +`trusted-server.toml`: + +```bash +TRUSTED_SERVER__INTEGRATIONS__DATADOME__SERVER_SIDE_KEY=your-key +``` + +### Configuration Options + +| Option | Type | Default | Description | +| ----------------------- | ------- | -------------------------------- | -------------------------------------------------------------------- | +| `server_side_enabled` | boolean | `false` | Enable server-side validation | +| `server_side_key` | string | — | DataDome server-side API key (required when enabled) | +| `validation_endpoint` | string | `https://api-fastly.datadome.co` | DataDome validation API base URL | +| `validation_timeout_ms` | integer | `200` | Timeout for the validation request (50–1000 ms) | +| `fail_open` | boolean | `true` | Allow the request if validation times out or errors | +| `sample_rate` | integer | `100` | Percentage of requests to validate (0–100). Use for gradual rollout. 
| + +### Gradual Rollout + +Use `sample_rate` to enable validation for a fraction of traffic while you +gain confidence: + +```toml +[integrations.datadome] +server_side_enabled = true +server_side_key = "your-key" +sample_rate = 10 # validate 10% of requests +``` + +Increase toward 100 once you're satisfied with the block rate and latency +impact. Sampling is IP-stable — a given IP address consistently falls in or out +of the sampled set across requests. + +### Fail-Open vs Fail-Closed + +`fail_open = true` (the default) means any DataDome API error or timeout +results in the request being allowed through. This keeps your site available +even if DataDome is unreachable. + +`fail_open = false` blocks requests whenever validation cannot be completed. +Only use this after validating DataDome uptime in your region and at your +traffic volume. + ## See Also - [DataDome First-Party Integration Docs](https://docs.datadome.co/docs/integrations#first-party-javascript-tag) diff --git a/docs/guide/multi-backend-routing.md b/docs/guide/multi-backend-routing.md new file mode 100644 index 00000000..6cd4ec22 --- /dev/null +++ b/docs/guide/multi-backend-routing.md @@ -0,0 +1,184 @@ +# Multi-Backend Routing + +Route incoming requests to different origin servers based on the request's host +domain or URL path. This is useful when a single Trusted Server deployment +serves multiple publishers or products that live on different backend origins. + +## Overview + +By default, all requests proxy to the single `origin_url` defined in +`[publisher]`. Multi-backend routing lets you override that origin on a +per-request basis using two matching strategies: + +- **Domain matching** — exact hostname match (with automatic `www.` stripping) +- **Path pattern matching** — URL prefix or regex match, optionally scoped to a + specific host + +Backends are evaluated in order. Domain matches take priority over path +patterns. Unmatched requests fall back to the default `publisher.origin_url`. 
+ +## Configuration + +Backends are declared as `[[backends]]` entries in `trusted-server.toml` (or +a separate `backends.toml` file merged at build time — see +[Separating Customer Config](#separating-customer-config)). + +### Domain-Based Routing + +Route all traffic for a set of domains to a specific origin: + +```toml +[[backends]] +id = "site-a" +origin_url = "https://origin-a.example.com" +domains = ["site-a.example.com", "www.site-a.example.com"] + +[[backends]] +id = "site-b" +origin_url = "https://origin-b.example.com" +domains = ["site-b.example.com"] +``` + +`www.` prefixes are stripped before matching, so `www.site-a.example.com` and +`site-a.example.com` both resolve to the same backend entry. + +### Path-Based Routing + +Route a subset of paths to a different origin, optionally scoped to a specific +host: + +```toml +[[backends]] +id = "api" +origin_url = "https://api.example.com" + + [[backends.path_patterns]] + host = "site-a.example.com" + path_prefix = "/.api/" + + [[backends.path_patterns]] + host = "site-a.example.com" + path_prefix = "/my-account" +``` + +Use a regular expression when prefix matching is not precise enough: + +```toml +[[backends]] +id = "image-cdn" +origin_url = "https://cdn.example.com" + + [[backends.path_patterns]] + host = "*" + path_regex = "^/image/upload/" +``` + +Setting `host = "*"` (or omitting `host`) matches any hostname. + +### TLS Settings + +Each backend can control whether TLS certificates are verified: + +```toml +[[backends]] +id = "internal" +origin_url = "http://internal.corp" +certificate_check = false # disable TLS verification for internal backends +domains = ["internal.example.com"] +``` + +> **Warning:** Only disable `certificate_check` for internal origins you fully +> control. Disabling it for public origins exposes requests to interception. 
+ +### Reference + +| Field | Type | Default | Description | +| ------------------- | ----------------- | ------- | ------------------------------------------------ | +| `id` | string | — | Optional label used in log output for debugging | +| `origin_url` | string (URL) | — | **Required.** Backend origin URL | +| `domains` | array of strings | `[]` | Hostnames to route to this backend | +| `path_patterns` | array of patterns | `[]` | Path-based routing rules (see below) | +| `certificate_check` | boolean | `true` | Verify TLS certificate on the backend connection | + +**Path pattern fields:** + +| Field | Type | Description | +| ------------- | ------ | ------------------------------------------------------------- | +| `host` | string | Hostname to scope this pattern to. `"*"` or omit for any host | +| `path_prefix` | string | Route requests whose path starts with this string | +| `path_regex` | string | Route requests whose path matches this regular expression | + +Only one of `path_prefix` or `path_regex` should be set per pattern entry. + +## Selection Priority + +For each request Trusted Server picks an origin in this order: + +1. **Domain index** — exact hostname match (after `www.` stripping) +2. **Path patterns** — first matching pattern across all backend entries +3. **Default** — `publisher.origin_url` + +Because domain matches are checked before path patterns, a backend that declares +both `domains` and `path_patterns` will always be reached via its domain match; +its path patterns only fire for hostnames not covered by any domain entry. + +## Separating Customer Config + +For deployments serving many sites, keep domain lists out of +`trusted-server.toml` by placing them in a separate file: + +``` +crates/trusted-server-adapter-fastly/backends.toml +``` + +`build.rs` merges this file into the embedded config at compile time. The file +uses the same `[[backends]]` syntax and is invisible to the shared application +template. 
+ +```toml +# crates/trusted-server-adapter-fastly/backends.toml +# Merged at build time by crates/trusted-server-core/build.rs + +[[backends]] +id = "raven" +origin_url = "https://origin.prod.example.com" +certificate_check = true +domains = [ + "site-a.example.com", + "site-b.example.com", + # ... additional domains +] +``` + +## How It Works + +```mermaid +flowchart TD + req["Incoming Request\nHost: site-a.example.com\nPath: /.api/users"] + + subgraph router["BackendRouter"] + domain["Domain index lookup\nsite-a.example.com → backend-a?"] + path["Path pattern scan\npath_prefix: /.api/ match?"] + fallback["Default origin\npublisher.origin_url"] + end + + origin_a["origin-a.example.com"] + origin_b["api.example.com"] + origin_default["default-origin.example.com"] + + req --> domain + domain -->|"match"| origin_a + domain -->|"no match"| path + path -->|"match"| origin_b + path -->|"no match"| fallback + fallback --> origin_default +``` + +The router is built once per request from the embedded config. Dynamic Fastly +backends are created on demand — the backend name encodes the origin URL, port, +TLS settings, and timeout so that configurations never collide. + +## See Also + +- [Configuration](/guide/configuration) +- [Architecture](/guide/architecture) diff --git a/scripts/integration-tests.sh b/scripts/integration-tests.sh index 336236b6..b12138f2 100755 --- a/scripts/integration-tests.sh +++ b/scripts/integration-tests.sh @@ -75,3 +75,20 @@ RUST_LOG=info \ --manifest-path crates/integration-tests/Cargo.toml \ --target "$TARGET" \ -- --include-ignored --test-threads=1 "${TEST_ARGS[@]}" + +echo "==> Building routing WASM binary (test backends: ports 19090-19093)..." 
+ROUTING_TEST_BACKENDS=1 \ +TRUSTED_SERVER__PUBLISHER__ORIGIN_URL="http://127.0.0.1:19090" \ +TRUSTED_SERVER__PUBLISHER__PROXY_SECRET="integration-test-proxy-secret" \ +TRUSTED_SERVER__SYNTHETIC__SECRET_KEY="integration-test-secret-key" \ +TRUSTED_SERVER__PROXY__CERTIFICATE_CHECK=false \ + cargo build --package trusted-server-adapter-fastly --release --target wasm32-wasip1 + +echo "==> Running routing integration tests..." +ROUTING_WASM_PATH="$REPO_ROOT/target/wasm32-wasip1/release/trusted-server-adapter-fastly.wasm" \ +RUST_LOG=info \ + cargo test \ + --manifest-path crates/integration-tests/Cargo.toml \ + --target "$TARGET" \ + --test routing \ + -- --test-threads=1 diff --git a/trusted-server.toml b/trusted-server.toml index 4a4f3128..c0a7fd1f 100644 --- a/trusted-server.toml +++ b/trusted-server.toml @@ -99,6 +99,11 @@ rewrite_sdk = true # Endpoints: # GET /integrations/datadome/tags.js - Proxied SDK script # ANY /integrations/datadome/js/* - Signal collection API +# +# Server-side validation (NEW): +# Validates requests at the edge BEFORE they reach origin +# Replaces expensive VCL restart patterns with native Rust HTTP API calls +# Staging-friendly: feature flag, fail-open, low timeout, sampling support [integrations.datadome] enabled = false sdk_origin = "https://js.datadome.co" @@ -106,6 +111,17 @@ api_origin = "https://api-js.datadome.co" cache_ttl_seconds = 3600 rewrite_sdk = true +# Server-side validation settings (optional) +# Enable to validate requests before proxying to origin (replaces VCL preflight) +server_side_enabled = false +# API key for validation (set via environment variable in production): +# TRUSTED_SERVER__INTEGRATIONS__DATADOME__SERVER_SIDE_KEY="your-api-key-here" +# server_side_key = "" +validation_endpoint = "https://api-fastly.datadome.co" +validation_timeout_ms = 200 # Low timeout for fast fail-open +fail_open = true # Allow requests if validation fails/times out (safety) +sample_rate = 100 # Percentage of requests to validate 
(0-100, for gradual rollout) + [integrations.gpt] enabled = false script_url = "https://securepubads.g.doubleclick.net/tag/js/gpt.js" @@ -153,6 +169,21 @@ rewrite_script = true # Defaults to true. Set to false only for local development with self-signed certificates. # certificate_check = true +# Multi-backend routing — routes traffic to different origins based on host/path. +# Customer-specific backend lists live in crates/trusted-server-adapter-fastly/backends.toml and are +# merged into the embedded config at build time. +# +# Schema reference: +# [[backends]] +# id = "my-backend" # Optional label for logging +# origin_url = "https://origin.example.com" +# certificate_check = true +# domains = ["example.com", "www.example.com"] +# path_patterns = [ +# { host = "example.com", path_prefix = "/api/" }, +# { host = "*", path_regex = "^/images/" }, +# ] + [auction] enabled = true providers = ["prebid"]