From d8b53e3fe6847d1def17f1eeb1ee588912350e78 Mon Sep 17 00:00:00 2001 From: Javier Arias Date: Wed, 25 Mar 2026 11:21:17 +0000 Subject: [PATCH 01/19] WIP OAI-PMH --- Cargo.lock | 110 ++- Cargo.toml | 10 +- Makefile | 6 +- src/bin/arguments/mod.rs | 11 + src/bin/commands/start.rs | 40 +- src/bin/thoth.rs | 1 + src/lib.rs | 1 + thoth-api/Cargo.toml | 8 +- thoth-api/src/model/work_relation/crud.rs | 6 +- thoth-client/Cargo.toml | 2 +- thoth-client/assets/queries.graphql | 94 +++ thoth-client/src/lib.rs | 125 ++- thoth-client/src/parameters.rs | 173 ++++- thoth-client/src/queries.rs | 77 ++ thoth-errors/Cargo.toml | 2 +- thoth-export-server/Cargo.toml | 2 +- .../src/bibtex/bibtex_thoth.rs | 1 + thoth-export-server/src/csv/csv_thoth.rs | 1 + thoth-export-server/src/csv/kbart_oclc.rs | 4 + thoth-export-server/src/json/json_thoth.rs | 2 + .../src/marc21/marc21record_thoth.rs | 4 + .../src/xml/doideposit_crossref.rs | 2 + .../src/xml/onix21_ebsco_host.rs | 4 + .../src/xml/onix21_proquest_ebrary.rs | 4 + thoth-export-server/src/xml/onix31_thoth.rs | 1 + .../src/xml/onix3_google_books.rs | 4 + thoth-export-server/src/xml/onix3_jstor.rs | 4 + thoth-export-server/src/xml/onix3_oapen.rs | 4 + .../src/xml/onix3_overdrive.rs | 4 + .../src/xml/onix3_project_muse.rs | 4 + thoth-export-server/src/xml/onix3_thoth.rs | 1 + thoth-oai-server/Cargo.toml | 26 + thoth-oai-server/README.md | 3 + thoth-oai-server/assets/oai2.xsl | 707 +++++++++++++++++ thoth-oai-server/src/lib.rs | 572 ++++++++++++++ thoth-oai-server/src/metadata.rs | 731 ++++++++++++++++++ thoth-oai-server/src/service.rs | 527 +++++++++++++ 37 files changed, 3236 insertions(+), 42 deletions(-) create mode 100644 thoth-oai-server/Cargo.toml create mode 100644 thoth-oai-server/README.md create mode 100644 thoth-oai-server/assets/oai2.xsl create mode 100644 thoth-oai-server/src/lib.rs create mode 100644 thoth-oai-server/src/metadata.rs create mode 100644 thoth-oai-server/src/service.rs diff --git a/Cargo.lock b/Cargo.lock 
index 6dcef3b41..86e5b4d15 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1395,14 +1395,38 @@ version = "1.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f8a51dd197fa6ba5b4dc98a990a43cc13693c23eb0089ebb0fcc1f04152bca6" +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core 0.20.11", + "darling_macro 0.20.11", +] + [[package]] name = "darling" version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" dependencies = [ - "darling_core", - "darling_macro", + "darling_core 0.21.3", + "darling_macro 0.21.3", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.117", ] [[package]] @@ -1419,13 +1443,24 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core 0.20.11", + "quote", + "syn 2.0.117", +] + [[package]] name = "darling_macro" version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" dependencies = [ - "darling_core", + "darling_core 0.21.3", "quote", "syn 2.0.117", ] @@ -1552,15 +1587,14 @@ dependencies = [ [[package]] name = "diesel" -version = "2.3.6" +version = "2.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d9b6c2fc184a6fb6ebcf5f9a5e3bbfa84d8fd268cdfcce4ed508979a6259494d" +checksum = "470eb10efc8646313634c99bb1593f402a6434cbd86e266770c6e39219adb86a" dependencies = [ "bitflags 2.11.0", "byteorder", "chrono", "diesel_derives", - "downcast-rs", "itoa", "pq-sys", "r2d2", @@ -1593,9 +1627,9 @@ dependencies = [ [[package]] name = "diesel_derives" -version = "2.3.7" +version = "2.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47618bf0fac06bb670c036e48404c26a865e6a71af4114dfd97dfe89936e404e" +checksum = "1b96984c469425cb577bf6f17121ecb3e4fe1e81de5d8f780dd372802858d756" dependencies = [ "diesel_table_macro_syntax", "dsl_auto_type", @@ -1606,9 +1640,9 @@ dependencies = [ [[package]] name = "diesel_migrations" -version = "2.3.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745fd255645f0f1135f9ec55c7b00e0882192af9683ab4731e4bba3da82b8f9c" +checksum = "8a73ce704bad4231f001bff3314d91dce4aba0770cee8b233991859abc15c1f6" dependencies = [ "diesel", "migrations_internals", @@ -1617,9 +1651,9 @@ dependencies = [ [[package]] name = "diesel_table_macro_syntax" -version = "0.3.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe2444076b48641147115697648dc743c2c00b61adade0f01ce67133c7babe8c" +checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25" dependencies = [ "syn 2.0.117", ] @@ -1653,19 +1687,13 @@ version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f" -[[package]] -name = "downcast-rs" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "117240f60069e65410b3ae1bb213295bd828f707b5bec6596a1afc8793ce0cbc" - [[package]] name = "dsl_auto_type" -version = "0.2.0" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"dd122633e4bef06db27737f21d3738fb89c8f6d5360d6d9d7635dda142a7757e" +checksum = "139ae9aca7527f85f26dd76483eb38533fd84bd571065da1739656ef71c5ff5b" dependencies = [ - "darling", + "darling 0.20.11", "either", "heck 0.5.0", "proc-macro2", @@ -3010,9 +3038,9 @@ checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "migrations_internals" -version = "2.3.0" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36c791ecdf977c99f45f23280405d7723727470f6689a5e6dbf513ac547ae10d" +checksum = "3bda1634d70d5bd53553cf15dca9842a396e8c799982a3ad22998dc44d961f24" dependencies = [ "serde", "toml", @@ -3020,9 +3048,9 @@ dependencies = [ [[package]] name = "migrations_macros" -version = "2.3.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36fc5ac76be324cfd2d3f2cf0fdf5d5d3c4f14ed8aaebadb09e304ba42282703" +checksum = "ffb161cc72176cb37aa47f1fc520d3ef02263d67d661f44f05d05a079e1237fd" dependencies = [ "migrations_internals", "proc-macro2", @@ -4659,7 +4687,7 @@ version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6d4e30573c8cb306ed6ab1dca8423eec9a463ea0e155f45399455e0368b27e0" dependencies = [ - "darling", + "darling 0.21.3", "proc-macro2", "quote", "syn 2.0.117", @@ -5112,6 +5140,7 @@ dependencies = [ "thoth-api-server", "thoth-errors", "thoth-export-server", + "thoth-oai-server", "tokio", "zitadel", ] @@ -5238,6 +5267,27 @@ dependencies = [ "xml-rs", ] +[[package]] +name = "thoth-oai-server" +version = "0.13.16" +dependencies = [ + "actix-cors", + "actix-web", + "base64 0.22.1", + "chrono", + "env_logger", + "log", + "quick-xml", + "reqwest 0.12.28", + "serde", + "serde_json", + "thoth-api", + "thoth-client", + "thoth-errors", + "uuid", + "xml-rs", +] + [[package]] name = "time" version = "0.3.47" @@ -5367,10 +5417,12 @@ version = "0.9.12+spec-1.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "cf92845e79fc2e2def6a5d828f0801e29a2f8acc037becc5ab08595c7d5e9863" dependencies = [ + "indexmap 2.13.0", "serde_core", "serde_spanned", "toml_datetime", "toml_parser", + "toml_writer", "winnow", ] @@ -5392,6 +5444,12 @@ dependencies = [ "winnow", ] +[[package]] +name = "toml_writer" +version = "1.0.6+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab16f14aed21ee8bfd8ec22513f7287cd4a91aa92e44edfe2c17ddd004e92607" + [[package]] name = "tonic" version = "0.12.3" diff --git a/Cargo.toml b/Cargo.toml index 8d12e9d5a..8df50732a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,13 +12,21 @@ readme = "README.md" maintenance = { status = "actively-developed" } [workspace] -members = ["thoth-api", "thoth-api-server", "thoth-client", "thoth-errors", "thoth-export-server"] +members = [ + "thoth-api", + "thoth-api-server", + "thoth-client", + "thoth-errors", + "thoth-export-server", + "thoth-oai-server", +] [dependencies] thoth-api = { version = "=0.13.16", path = "thoth-api", features = ["backend"] } thoth-api-server = { version = "=0.13.16", path = "thoth-api-server" } thoth-errors = { version = "=0.13.16", path = "thoth-errors" } thoth-export-server = { version = "=0.13.16", path = "thoth-export-server" } +thoth-oai-server = { version = "=0.13.16", path = "thoth-oai-server" } base64 = "0.22.1" clap = { version = "4.5.32", features = ["cargo", "env"] } dialoguer = { version = "0.11.0", features = ["password"] } diff --git a/Makefile b/Makefile index a073d2a30..94bec8ca6 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,7 @@ run-zitadel \ run-graphql-api \ run-export-api \ + run-oai-api \ build \ test \ check \ @@ -31,6 +32,7 @@ help: @echo " run-zitadel Start Zitadel (docker)" @echo " run-graphql-api Run GraphQL API (cargo)" @echo " run-export-api Run export API (cargo)" + @echo " run-oai-api Run OAI-PMH API (cargo)" @echo " build Build the workspace" @echo " test Run 
tests" @echo " coverage Run test coverage (cargo llvm-cov)" @@ -59,6 +61,9 @@ run-graphql-api: build run-export-api: build RUST_BACKTRACE=1 cargo run start export-api +run-oai-api: build + RUST_BACKTRACE=1 cargo run start oai-api + build: cargo build -vv @@ -89,4 +94,3 @@ migration: mkdir -p $$dir; \ touch $$dir/up.sql; \ touch $$dir/down.sql; - diff --git a/src/bin/arguments/mod.rs b/src/bin/arguments/mod.rs index 236948101..98d939b35 100644 --- a/src/bin/arguments/mod.rs +++ b/src/bin/arguments/mod.rs @@ -85,6 +85,17 @@ pub fn export_url() -> Arg { .num_args(1) } +pub fn oai_url() -> Arg { + Arg::new("oai-url") + .short('o') + .long("oai-url") + .value_name("THOTH_OAI_API") + .env("THOTH_OAI_API") + .default_value("http://localhost:8383") + .help("Thoth OAI-PMH API's, public facing, root URL.") + .num_args(1) +} + pub fn zitadel_url() -> Arg { Arg::new("zitadel-url") .short('z') diff --git a/src/bin/commands/start.rs b/src/bin/commands/start.rs index c557c857d..4061b42f6 100644 --- a/src/bin/commands/start.rs +++ b/src/bin/commands/start.rs @@ -1,7 +1,7 @@ use crate::arguments; use clap::{ArgMatches, Command}; use lazy_static::lazy_static; -use thoth::{api_server, errors::ThothResult, export_server}; +use thoth::{api_server, errors::ThothResult, export_server, oai_server}; lazy_static! { pub(crate) static ref COMMAND: Command = Command::new("start") @@ -33,6 +33,17 @@ lazy_static! 
{ .arg(arguments::keep_alive("EXPORT_API_KEEP_ALIVE")) .arg(arguments::export_url()) .arg(arguments::gql_endpoint()), + ) + .subcommand( + Command::new("oai-api") + .about("Start the thoth OAI-PMH API") + .arg(arguments::host("OAI_API_HOST")) + .arg(arguments::port("8383", "OAI_API_PORT")) + .arg(arguments::threads("OAI_API_THREADS")) + .arg(arguments::keep_alive("OAI_API_KEEP_ALIVE")) + .arg(arguments::oai_url()) + .arg(arguments::gql_endpoint()) + .arg(arguments::export_url()), ); } @@ -98,3 +109,30 @@ pub fn export_api(arguments: &ArgMatches) -> ThothResult<()> { ) .map_err(|e| e.into()) } + +pub fn oai_api(arguments: &ArgMatches) -> ThothResult<()> { + let host = arguments.get_one::("host").unwrap().to_owned(); + let port = arguments.get_one::("port").unwrap().to_owned(); + let threads = *arguments.get_one::("threads").unwrap(); + let keep_alive = *arguments.get_one::("keep-alive").unwrap(); + let public_url = arguments.get_one::("oai-url").unwrap().to_owned(); + let gql_endpoint = arguments + .get_one::("gql-endpoint") + .unwrap() + .to_owned(); + let export_url = arguments + .get_one::("export-url") + .unwrap() + .to_owned(); + + oai_server( + host, + port, + threads, + keep_alive, + public_url, + gql_endpoint, + export_url, + ) + .map_err(|e| e.into()) +} diff --git a/src/bin/thoth.rs b/src/bin/thoth.rs index 6ee60f80c..4db5fe212 100644 --- a/src/bin/thoth.rs +++ b/src/bin/thoth.rs @@ -23,6 +23,7 @@ fn main() -> thoth::errors::ThothResult<()> { Some(("start", start_arguments)) => match start_arguments.subcommand() { Some(("graphql-api", arguments)) => commands::start::graphql_api(arguments), Some(("export-api", arguments)) => commands::start::export_api(arguments), + Some(("oai-api", arguments)) => commands::start::oai_api(arguments), _ => unreachable!(), }, Some(("migrate", arguments)) => commands::migrate(arguments), diff --git a/src/lib.rs b/src/lib.rs index 10b035ce2..5ef4da6b4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,3 +2,4 @@ pub use thoth_api as 
api; pub use thoth_api_server::start_server as api_server; pub use thoth_errors as errors; pub use thoth_export_server::{start_server as export_server, ALL_SPECIFICATIONS}; +pub use thoth_oai_server::start_server as oai_server; diff --git a/thoth-api/Cargo.toml b/thoth-api/Cargo.toml index 601af33eb..e3de6967d 100644 --- a/thoth-api/Cargo.toml +++ b/thoth-api/Cargo.toml @@ -38,10 +38,10 @@ argon2rs = { version = "0.2.5", optional = true } isbn = "0.6.0" chrono = { version = "0.4.40", features = ["serde"] } deadpool-redis = { version = "0.20.0", optional = true } -diesel = { version = "2.2.8", features = ["postgres", "uuid", "chrono", "r2d2", "64-column-tables", "serde_json"], optional = true } -diesel-derive-enum = { version = "2.1.0", features = ["postgres"], optional = true } -diesel-derive-newtype = "2.1.2" -diesel_migrations = { version = "2.2.0", features = ["postgres"], optional = true } +diesel = { version = "=2.2.8", features = ["postgres", "uuid", "chrono", "r2d2", "64-column-tables", "serde_json"], optional = true } +diesel-derive-enum = { version = "=2.1.0", features = ["postgres"], optional = true } +diesel-derive-newtype = "=2.1.2" +diesel_migrations = { version = "=2.2.0", features = ["postgres"], optional = true } dotenv = "0.15.0" futures = { version = "0.3.31", optional = true } jsonwebtoken = { version = "9.3.1", optional = true } diff --git a/thoth-api/src/model/work_relation/crud.rs b/thoth-api/src/model/work_relation/crud.rs index 6fbe565dd..27c45b099 100644 --- a/thoth-api/src/model/work_relation/crud.rs +++ b/thoth-api/src/model/work_relation/crud.rs @@ -5,8 +5,8 @@ use super::{ use crate::model::{Crud, DbInsert, HistoryEntry, PublisherId, Reorder}; use crate::schema::{work_relation, work_relation_history}; use diesel::{ - dsl::max, sql_query, sql_types::Text, BoolExpressionMethods, Connection, ExpressionMethods, - QueryDsl, RunQueryDsl, + sql_query, sql_types::Text, BoolExpressionMethods, Connection, ExpressionMethods, QueryDsl, + 
RunQueryDsl, }; use thoth_errors::{ThothError, ThothResult}; use uuid::Uuid; @@ -142,7 +142,7 @@ impl Crud for WorkRelation { // This will return `None` if no records with this work and type already exist. let max_inverse_ordinal = work_relation::table - .select(max(work_relation::relation_ordinal)) + .select(diesel::dsl::max(work_relation::relation_ordinal)) .filter(work_relation::relator_work_id.eq(data.related_work_id).and( work_relation::relation_type.eq(data.relation_type.convert_to_inverse()), )) diff --git a/thoth-client/Cargo.toml b/thoth-client/Cargo.toml index 6ab8f94cb..4f958a7c4 100644 --- a/thoth-client/Cargo.toml +++ b/thoth-client/Cargo.toml @@ -10,7 +10,7 @@ readme = "README.md" build = "build.rs" [dependencies] -thoth-api = {version = "=0.13.16", path = "../thoth-api" } +thoth-api = {version = "=0.13.16", path = "../thoth-api", features = ["backend"] } thoth-errors = {version = "=0.13.16", path = "../thoth-errors" } graphql_client = "0.14.0" chrono = { version = "0.4.40", features = ["serde"] } diff --git a/thoth-client/assets/queries.graphql b/thoth-client/assets/queries.graphql index 68d8b6bbd..a858935c6 100644 --- a/thoth-client/assets/queries.graphql +++ b/thoth-client/assets/queries.graphql @@ -34,8 +34,14 @@ fragment Funding on Funding { } } +fragment PublisherFields on Publisher { + publisherId + publisherName +} + fragment Work on Work { workId + updatedAtWithRelations workStatus workType reference @@ -334,3 +340,91 @@ query WorksLastUpdatedQuery( updatedAtWithRelations } } + +query OaiWorksQuery( + $limit: Int!, + $offset: Int!, + $publishers: [Uuid!], + $abstractsLimit: Int!, + $issuesLimit: Int!, + $languagesLimit: Int!, + $publicationsLimit: Int!, + $subjectsLimit: Int!, + $titlesLimit: Int!, + $fundingsLimit: Int!, + $relationsLimit: Int!, + $referencesLimit: Int! 
+) { + works( + limit: $limit, + offset: $offset, + publishers: $publishers, + workStatuses: [ACTIVE], + order: {field: UPDATED_AT_WITH_RELATIONS, direction: DESC} + ) { + ...Work + } +} + +query OaiBooksQuery( + $limit: Int!, + $offset: Int!, + $publishers: [Uuid!], + $abstractsLimit: Int!, + $issuesLimit: Int!, + $languagesLimit: Int!, + $publicationsLimit: Int!, + $subjectsLimit: Int!, + $titlesLimit: Int!, + $fundingsLimit: Int!, + $relationsLimit: Int!, + $referencesLimit: Int! +) { + books( + limit: $limit, + offset: $offset, + publishers: $publishers, + workStatuses: [ACTIVE], + order: {field: UPDATED_AT_WITH_RELATIONS, direction: DESC} + ) { + ...Work + } +} + +query OaiWorkCountQuery( + $publishers: [Uuid!] +) { + workCount(publishers: $publishers, workStatuses: [ACTIVE]) +} + +query OaiBookCountQuery( + $publishers: [Uuid!] +) { + bookCount(publishers: $publishers, workStatuses: [ACTIVE]) +} + +query OaiLatestWorksUpdatedQuery { + works( + workStatuses: [ACTIVE], + limit: 1, + order: {field: UPDATED_AT_WITH_RELATIONS, direction: DESC} + ) { + updatedAtWithRelations + } +} + +query OaiEarliestWorksUpdatedQuery { + works( + workStatuses: [ACTIVE], + limit: 1, + order: {field: UPDATED_AT_WITH_RELATIONS, direction: ASC} + ) { + updatedAtWithRelations + } +} + +query PublishersQuery { + publishers(limit: 10000) { + ...PublisherFields + } +} diff --git a/thoth-client/src/lib.rs b/thoth-client/src/lib.rs index 6ac9ce99e..5e4386940 100644 --- a/thoth-client/src/lib.rs +++ b/thoth-client/src/lib.rs @@ -7,8 +7,12 @@ pub use crate::parameters::QueryParameters; use crate::parameters::{WorkQueryVariables, WorksQueryVariables}; pub use crate::queries::work_query::*; use crate::queries::{ + oai_book_count_query, oai_books_query, oai_earliest_works_updated_query, + oai_latest_works_updated_query, oai_work_count_query, oai_works_query, publishers_query, work_count_query, work_last_updated_query, work_query, works_last_updated_query, works_query, - WorkCountQuery, 
WorkLastUpdatedQuery, WorkQuery, WorksLastUpdatedQuery, WorksQuery, + OaiBookCountQuery, OaiBooksQuery, OaiEarliestWorksUpdatedQuery, OaiLatestWorksUpdatedQuery, + OaiWorkCountQuery, OaiWorksQuery, PublishersQuery, WorkCountQuery, WorkLastUpdatedQuery, + WorkQuery, WorksLastUpdatedQuery, WorksQuery, }; pub use chrono::NaiveDate; use graphql_client::GraphQLQuery; @@ -23,6 +27,12 @@ use thoth_api::model::Timestamp; use thoth_errors::{ThothError, ThothResult}; use uuid::Uuid; +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Publisher { + pub publisher_id: Uuid, + pub publisher_name: String, +} + /// Maximum number of allowed request retries attempts. const MAX_REQUEST_RETRIES: u32 = 5; @@ -268,4 +278,117 @@ impl ThothClient { None => Err(ThothError::EntityNotFound), } } + + pub async fn get_oai_works( + &self, + publishers: Option>, + limit: i64, + offset: i64, + parameters: QueryParameters, + ) -> ThothResult> { + let variables: oai_works_query::Variables = + WorksQueryVariables::new(publishers, limit, offset, parameters).into(); + let request_body = OaiWorksQuery::build_query(variables); + let res = self.post_request(&request_body).await.await?; + let response_body: Response = + self.parse_graphql_response(res).await?; + match response_body.data { + Some(data) => Ok(data.works.into_iter().map(Into::into).collect()), + None => Err(ThothError::EntityNotFound), + } + } + + pub async fn get_oai_books( + &self, + publishers: Option>, + limit: i64, + offset: i64, + parameters: QueryParameters, + ) -> ThothResult> { + let variables: oai_books_query::Variables = + WorksQueryVariables::new(publishers, limit, offset, parameters).into(); + let request_body = OaiBooksQuery::build_query(variables); + let res = self.post_request(&request_body).await.await?; + let response_body: Response = + self.parse_graphql_response(res).await?; + match response_body.data { + Some(data) => Ok(data.books.into_iter().map(Into::into).collect()), + None => Err(ThothError::EntityNotFound), + } + 
} + + pub async fn get_oai_work_count(&self, publishers: Option>) -> ThothResult { + let variables = oai_work_count_query::Variables { publishers }; + let request_body = OaiWorkCountQuery::build_query(variables); + let res = self.post_request(&request_body).await.await?; + let response_body: Response = + self.parse_graphql_response(res).await?; + match response_body.data { + Some(data) => Ok(data.work_count), + None => Err(ThothError::EntityNotFound), + } + } + + pub async fn get_oai_book_count(&self, publishers: Option>) -> ThothResult { + let variables = oai_book_count_query::Variables { publishers }; + let request_body = OaiBookCountQuery::build_query(variables); + let res = self.post_request(&request_body).await.await?; + let response_body: Response = + self.parse_graphql_response(res).await?; + match response_body.data { + Some(data) => Ok(data.book_count), + None => Err(ThothError::EntityNotFound), + } + } + + pub async fn get_oai_latest_works_updated(&self) -> ThothResult { + let request_body = + OaiLatestWorksUpdatedQuery::build_query(oai_latest_works_updated_query::Variables {}); + let res = self.post_request(&request_body).await.await?; + let response_body: Response = + self.parse_graphql_response(res).await?; + match response_body.data { + Some(data) => data + .works + .first() + .map(|work| work.updated_at_with_relations) + .ok_or(ThothError::EntityNotFound), + None => Err(ThothError::EntityNotFound), + } + } + + pub async fn get_oai_earliest_works_updated(&self) -> ThothResult { + let request_body = OaiEarliestWorksUpdatedQuery::build_query( + oai_earliest_works_updated_query::Variables {}, + ); + let res = self.post_request(&request_body).await.await?; + let response_body: Response = + self.parse_graphql_response(res).await?; + match response_body.data { + Some(data) => data + .works + .first() + .map(|work| work.updated_at_with_relations) + .ok_or(ThothError::EntityNotFound), + None => Err(ThothError::EntityNotFound), + } + } + + pub async fn 
get_publishers(&self) -> ThothResult> { + let request_body = PublishersQuery::build_query(publishers_query::Variables {}); + let res = self.post_request(&request_body).await.await?; + let response_body: Response = + self.parse_graphql_response(res).await?; + match response_body.data { + Some(data) => Ok(data + .publishers + .into_iter() + .map(|publisher| Publisher { + publisher_id: publisher.publisher_id, + publisher_name: publisher.publisher_name, + }) + .collect()), + None => Err(ThothError::EntityNotFound), + } + } } diff --git a/thoth-client/src/parameters.rs b/thoth-client/src/parameters.rs index cdc4dfb1a..b802d05ac 100644 --- a/thoth-client/src/parameters.rs +++ b/thoth-client/src/parameters.rs @@ -1,4 +1,4 @@ -use crate::queries::{work_query, works_query}; +use crate::queries::{oai_books_query, oai_works_query, work_query, works_query}; use uuid::Uuid; /// A set of booleans to toggle directives in the GraphQL queries @@ -288,10 +288,120 @@ impl From for works_query::Variables { } } +impl From for oai_works_query::Variables { + fn from(v: WorksQueryVariables) -> Self { + oai_works_query::Variables { + publishers: v.publishers, + limit: v.limit, + offset: v.offset, + abstracts_limit: if v.parameters.with_abstracts { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_CANONICAL + }, + issues_limit: if v.parameters.with_issues { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + languages_limit: if v.parameters.with_languages { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + publications_limit: if v.parameters.with_publications { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + subjects_limit: if v.parameters.with_subjects { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + fundings_limit: if v.parameters.with_fundings { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + relations_limit: if v.parameters.with_relations { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + references_limit: if 
v.parameters.with_references { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + titles_limit: if v.parameters.with_titles { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_CANONICAL + }, + } + } +} + +impl From for oai_books_query::Variables { + fn from(v: WorksQueryVariables) -> Self { + oai_books_query::Variables { + publishers: v.publishers, + limit: v.limit, + offset: v.offset, + abstracts_limit: if v.parameters.with_abstracts { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_CANONICAL + }, + issues_limit: if v.parameters.with_issues { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + languages_limit: if v.parameters.with_languages { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + publications_limit: if v.parameters.with_publications { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + subjects_limit: if v.parameters.with_subjects { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + fundings_limit: if v.parameters.with_fundings { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + relations_limit: if v.parameters.with_relations { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + references_limit: if v.parameters.with_references { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + titles_limit: if v.parameters.with_titles { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_CANONICAL + }, + } + } +} + #[cfg(test)] mod tests { use super::*; - use crate::queries::{work_query, works_query}; + use crate::queries::{oai_books_query, oai_works_query, work_query, works_query}; #[test] fn test_default_query_parameters() { @@ -496,4 +606,63 @@ mod tests { } ); } + + #[test] + fn test_convert_parameters_to_oai_works_query_variables() { + let publisher_id: Uuid = Uuid::parse_str("00000000-0000-0000-AAAA-000000000001").unwrap(); + let publishers = Some(vec![publisher_id]); + let parameters = QueryParameters::new() + .with_issues() + .with_languages() + .with_publications(); + + let 
variables: oai_works_query::Variables = + WorksQueryVariables::new(publishers.clone(), 50, 25, parameters).into(); + + assert_eq!( + variables, + oai_works_query::Variables { + publishers: publishers.clone(), + limit: 50, + offset: 25, + abstracts_limit: FILTER_INCLUDE_CANONICAL, + issues_limit: FILTER_INCLUDE_ALL, + languages_limit: FILTER_INCLUDE_ALL, + publications_limit: FILTER_INCLUDE_ALL, + subjects_limit: FILTER_INCLUDE_NONE, + fundings_limit: FILTER_INCLUDE_NONE, + relations_limit: FILTER_INCLUDE_NONE, + references_limit: FILTER_INCLUDE_NONE, + titles_limit: FILTER_INCLUDE_CANONICAL, + } + ); + } + + #[test] + fn test_convert_parameters_to_oai_books_query_variables() { + let publisher_id: Uuid = Uuid::parse_str("00000000-0000-0000-AAAA-000000000001").unwrap(); + let publishers = Some(vec![publisher_id]); + let parameters = QueryParameters::new().with_all(); + + let variables: oai_books_query::Variables = + WorksQueryVariables::new(publishers.clone(), 10, 5, parameters).into(); + + assert_eq!( + variables, + oai_books_query::Variables { + publishers, + limit: 10, + offset: 5, + abstracts_limit: FILTER_INCLUDE_ALL, + issues_limit: FILTER_INCLUDE_ALL, + languages_limit: FILTER_INCLUDE_ALL, + publications_limit: FILTER_INCLUDE_ALL, + subjects_limit: FILTER_INCLUDE_ALL, + fundings_limit: FILTER_INCLUDE_ALL, + relations_limit: FILTER_INCLUDE_ALL, + references_limit: FILTER_INCLUDE_ALL, + titles_limit: FILTER_INCLUDE_ALL, + } + ); + } } diff --git a/thoth-client/src/queries.rs b/thoth-client/src/queries.rs index 6f25c9f18..993dea387 100644 --- a/thoth-client/src/queries.rs +++ b/thoth-client/src/queries.rs @@ -98,6 +98,69 @@ pub struct WorkLastUpdatedQuery; )] pub struct WorksLastUpdatedQuery; +#[derive(GraphQLQuery)] +#[graphql( + schema_path = "assets/schema.graphql", + query_path = "assets/queries.graphql", + response_derives = "Debug,Clone,Deserialize,Serialize,PartialEq", + variables_derives = "Debug,PartialEq" +)] +pub struct OaiWorksQuery; + 
+#[derive(GraphQLQuery)] +#[graphql( + schema_path = "assets/schema.graphql", + query_path = "assets/queries.graphql", + response_derives = "Debug,Clone,Deserialize,Serialize,PartialEq", + variables_derives = "Debug,PartialEq" +)] +pub struct OaiBooksQuery; + +#[derive(GraphQLQuery)] +#[graphql( + schema_path = "assets/schema.graphql", + query_path = "assets/queries.graphql", + response_derives = "Debug,Clone,Deserialize,Serialize,PartialEq", + variables_derives = "Debug,PartialEq" +)] +pub struct OaiWorkCountQuery; + +#[derive(GraphQLQuery)] +#[graphql( + schema_path = "assets/schema.graphql", + query_path = "assets/queries.graphql", + response_derives = "Debug,Clone,Deserialize,Serialize,PartialEq", + variables_derives = "Debug,PartialEq" +)] +pub struct OaiBookCountQuery; + +#[derive(GraphQLQuery)] +#[graphql( + schema_path = "assets/schema.graphql", + query_path = "assets/queries.graphql", + response_derives = "Debug,Clone,Deserialize,Serialize,PartialEq", + variables_derives = "Debug,PartialEq" +)] +pub struct OaiLatestWorksUpdatedQuery; + +#[derive(GraphQLQuery)] +#[graphql( + schema_path = "assets/schema.graphql", + query_path = "assets/queries.graphql", + response_derives = "Debug,Clone,Deserialize,Serialize,PartialEq", + variables_derives = "Debug,PartialEq" +)] +pub struct OaiEarliestWorksUpdatedQuery; + +#[derive(GraphQLQuery)] +#[graphql( + schema_path = "assets/schema.graphql", + query_path = "assets/queries.graphql", + response_derives = "Debug,Clone,Deserialize,Serialize,PartialEq", + variables_derives = "Debug,PartialEq" +)] +pub struct PublishersQuery; + // Needed to set work_query::Work as the canonical struct for the shared fragment in the two queries // until https://github.com/graphql-rust/graphql-client/issues/312 gets fixed impl From for work_query::Work { @@ -107,6 +170,20 @@ impl From for work_query::Work { } } +impl From for work_query::Work { + fn from(w: oai_works_query::Work) -> Self { + let se = serde_json::to_string(&w).unwrap(); + 
serde_json::from_str(&se).unwrap() + } +} + +impl From for work_query::Work { + fn from(w: oai_books_query::Work) -> Self { + let se = serde_json::to_string(&w).unwrap(); + serde_json::from_str(&se).unwrap() + } +} + // As above: enables shared processing of parent Works and child RelatedWorks in doideposit format impl From for work_query::WorkRelationsRelatedWork { fn from(w: work_query::Work) -> Self { diff --git a/thoth-errors/Cargo.toml b/thoth-errors/Cargo.toml index 311a03009..46178d9dc 100644 --- a/thoth-errors/Cargo.toml +++ b/thoth-errors/Cargo.toml @@ -14,7 +14,7 @@ chrono = "0.4.40" csv = "1.3.1" deadpool-redis = "0.20.0" dialoguer = { version = "0.11.0", features = ["password"] } -diesel = "2.2.8" +diesel = { version = "=2.2.8", features = ["postgres", "r2d2"] } juniper = "0.16.1" marc = { version = "3.1.1", features = ["xml"] } phf = { version = "0.11", features = ["macros"] } diff --git a/thoth-export-server/Cargo.toml b/thoth-export-server/Cargo.toml index a150acde5..5cfceb5f1 100644 --- a/thoth-export-server/Cargo.toml +++ b/thoth-export-server/Cargo.toml @@ -10,7 +10,7 @@ readme = "README.md" build = "build.rs" [dependencies] -thoth-api = { version = "=0.13.16", path = "../thoth-api" } +thoth-api = { version = "=0.13.16", path = "../thoth-api", features = ["backend"] } thoth-errors = { version = "=0.13.16", path = "../thoth-errors" } thoth-client = { version = "=0.13.16", path = "../thoth-client" } actix-web = "4.10" diff --git a/thoth-export-server/src/bibtex/bibtex_thoth.rs b/thoth-export-server/src/bibtex/bibtex_thoth.rs index 8f871de00..69e805229 100644 --- a/thoth-export-server/src/bibtex/bibtex_thoth.rs +++ b/thoth-export-server/src/bibtex/bibtex_thoth.rs @@ -309,6 +309,7 @@ mod tests { }, ], work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339("2024-01-01T00:00:00Z").unwrap(), work_status: WorkStatus::ACTIVE, titles: 
vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git a/thoth-export-server/src/csv/csv_thoth.rs b/thoth-export-server/src/csv/csv_thoth.rs index 61c65b9eb..6f9790acb 100644 --- a/thoth-export-server/src/csv/csv_thoth.rs +++ b/thoth-export-server/src/csv/csv_thoth.rs @@ -535,6 +535,7 @@ mod tests { lazy_static! { static ref TEST_WORK: Work = Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339("2024-01-01T00:00:00Z").unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git a/thoth-export-server/src/csv/kbart_oclc.rs b/thoth-export-server/src/csv/kbart_oclc.rs index cd378f723..9e0fb1de9 100644 --- a/thoth-export-server/src/csv/kbart_oclc.rs +++ b/thoth-export-server/src/csv/kbart_oclc.rs @@ -247,6 +247,10 @@ mod tests { fn test_kbart_oclc() { let mut test_work: Work = Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339( + "2024-01-01T00:00:00Z", + ) + .unwrap(), work_status: WorkStatus::ACTIVE, // We must manually set full_title within this test framework, but // Thoth UI compiles it automatically from title + (optional) subtitle diff --git a/thoth-export-server/src/json/json_thoth.rs b/thoth-export-server/src/json/json_thoth.rs index 6e8192a7e..06cddad47 100644 --- a/thoth-export-server/src/json/json_thoth.rs +++ b/thoth-export-server/src/json/json_thoth.rs @@ -77,6 +77,7 @@ mod tests { lazy_static! 
{ static ref TEST_WORK: Work = Work { work_id: Uuid::from_str("00000000-0000-0000-aaaa-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339("2024-01-01T00:00:00Z").unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-cccc-000000000001").unwrap(), @@ -517,6 +518,7 @@ mod tests { const TEST_RESULT: &str = r#" "workId": "00000000-0000-0000-aaaa-000000000001", + "updatedAtWithRelations": "2024-01-01T00:00:00Z", "workStatus": "ACTIVE", "workType": "MONOGRAPH", "reference": null, diff --git a/thoth-export-server/src/marc21/marc21record_thoth.rs b/thoth-export-server/src/marc21/marc21record_thoth.rs index efc9bfea5..db3580725 100644 --- a/thoth-export-server/src/marc21/marc21record_thoth.rs +++ b/thoth-export-server/src/marc21/marc21record_thoth.rs @@ -807,6 +807,10 @@ pub(crate) mod tests { pub(crate) fn test_work() -> Work { Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339( + "2024-01-01T00:00:00Z", + ) + .unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git a/thoth-export-server/src/xml/doideposit_crossref.rs b/thoth-export-server/src/xml/doideposit_crossref.rs index 2a0d26ffc..92de52a5d 100644 --- a/thoth-export-server/src/xml/doideposit_crossref.rs +++ b/thoth-export-server/src/xml/doideposit_crossref.rs @@ -1619,6 +1619,7 @@ mod tests { fn test_doideposit_crossref_works() { let mut test_work = Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339("2024-01-01T00:00:00Z").unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: 
Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), @@ -2458,6 +2459,7 @@ mod tests { fn test_doideposit_crossref_isbns_workaround() { let mut test_work = Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339("2024-01-01T00:00:00Z").unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git a/thoth-export-server/src/xml/onix21_ebsco_host.rs b/thoth-export-server/src/xml/onix21_ebsco_host.rs index 50883e410..6cb9a2eab 100644 --- a/thoth-export-server/src/xml/onix21_ebsco_host.rs +++ b/thoth-export-server/src/xml/onix21_ebsco_host.rs @@ -994,6 +994,10 @@ mod tests { fn test_onix21_ebsco_host_works() { let mut test_work = Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339( + "2024-01-01T00:00:00Z", + ) + .unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git a/thoth-export-server/src/xml/onix21_proquest_ebrary.rs b/thoth-export-server/src/xml/onix21_proquest_ebrary.rs index e68b32be6..3137ad7bd 100644 --- a/thoth-export-server/src/xml/onix21_proquest_ebrary.rs +++ b/thoth-export-server/src/xml/onix21_proquest_ebrary.rs @@ -972,6 +972,10 @@ mod tests { fn test_onix21_proquest_ebrary_works() { let mut test_work = Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339( + "2024-01-01T00:00:00Z", + ) + .unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git a/thoth-export-server/src/xml/onix31_thoth.rs 
b/thoth-export-server/src/xml/onix31_thoth.rs index e2856bae8..b379e4a94 100644 --- a/thoth-export-server/src/xml/onix31_thoth.rs +++ b/thoth-export-server/src/xml/onix31_thoth.rs @@ -2592,6 +2592,7 @@ mod tests { fn test_onix31_thoth_works() { let mut test_work = Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339("2024-01-01T00:00:00Z").unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git a/thoth-export-server/src/xml/onix3_google_books.rs b/thoth-export-server/src/xml/onix3_google_books.rs index e397ad186..c2a1f4191 100644 --- a/thoth-export-server/src/xml/onix3_google_books.rs +++ b/thoth-export-server/src/xml/onix3_google_books.rs @@ -963,6 +963,10 @@ mod tests { fn test_onix3_google_books_works() { let mut test_work = Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339( + "2024-01-01T00:00:00Z", + ) + .unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git a/thoth-export-server/src/xml/onix3_jstor.rs b/thoth-export-server/src/xml/onix3_jstor.rs index e33e1b85b..6e8ba7af3 100644 --- a/thoth-export-server/src/xml/onix3_jstor.rs +++ b/thoth-export-server/src/xml/onix3_jstor.rs @@ -879,6 +879,10 @@ mod tests { fn test_onix3_jstor_works() { let mut test_work = Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339( + "2024-01-01T00:00:00Z", + ) + .unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git 
a/thoth-export-server/src/xml/onix3_oapen.rs b/thoth-export-server/src/xml/onix3_oapen.rs index 4daf568e4..7b637ef91 100644 --- a/thoth-export-server/src/xml/onix3_oapen.rs +++ b/thoth-export-server/src/xml/onix3_oapen.rs @@ -1098,6 +1098,10 @@ mod tests { fn test_onix3_oapen_works() { let mut test_work = Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339( + "2024-01-01T00:00:00Z", + ) + .unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git a/thoth-export-server/src/xml/onix3_overdrive.rs b/thoth-export-server/src/xml/onix3_overdrive.rs index cd34fb5e7..89459e9be 100644 --- a/thoth-export-server/src/xml/onix3_overdrive.rs +++ b/thoth-export-server/src/xml/onix3_overdrive.rs @@ -1203,6 +1203,10 @@ mod tests { fn test_onix3_overdrive_works() { let mut test_work = Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339( + "2024-01-01T00:00:00Z", + ) + .unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git a/thoth-export-server/src/xml/onix3_project_muse.rs b/thoth-export-server/src/xml/onix3_project_muse.rs index 8240ec64c..6143cb8b1 100644 --- a/thoth-export-server/src/xml/onix3_project_muse.rs +++ b/thoth-export-server/src/xml/onix3_project_muse.rs @@ -991,6 +991,10 @@ mod tests { fn test_onix3_projectmuse_works() { let mut test_work = Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339( + "2024-01-01T00:00:00Z", + ) + .unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: 
Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git a/thoth-export-server/src/xml/onix3_thoth.rs b/thoth-export-server/src/xml/onix3_thoth.rs index 709edab88..ab2dffcc6 100644 --- a/thoth-export-server/src/xml/onix3_thoth.rs +++ b/thoth-export-server/src/xml/onix3_thoth.rs @@ -2344,6 +2344,7 @@ mod tests { fn test_onix3_thoth_works() { let mut test_work = Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339("2024-01-01T00:00:00Z").unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git a/thoth-oai-server/Cargo.toml b/thoth-oai-server/Cargo.toml new file mode 100644 index 000000000..2b1752e93 --- /dev/null +++ b/thoth-oai-server/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "thoth-oai-server" +version = "0.13.16" +authors = ["Javier Arias ", "Ross Higman "] +edition = "2021" +license = "Apache-2.0" +description = "Actix instance serving Thoth's OAI-PMH endpoints" +repository = "https://github.com/thoth-pub/thoth" +readme = "README.md" + +[dependencies] +thoth-api = { version = "=0.13.16", path = "../thoth-api", features = ["backend"] } +thoth-errors = { version = "=0.13.16", path = "../thoth-errors" } +thoth-client = { version = "=0.13.16", path = "../thoth-client" } +actix-cors = "0.7.1" +actix-web = "4.10" +base64 = "0.22.1" +chrono = { version = "0.4.40", features = ["serde"] } +env_logger = "0.11.7" +log = "0.4.26" +quick-xml = "0.36" +reqwest = { version = "0.12", features = ["json"] } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +uuid = { version = "1.16.0", features = ["serde"] } +xml-rs = "0.8.25" diff --git a/thoth-oai-server/README.md b/thoth-oai-server/README.md new file mode 100644 index 000000000..0712e6c76 --- /dev/null +++ b/thoth-oai-server/README.md @@ -0,0 +1,3 @@ +# Thoth OAI Server 
+ +OAI-PMH server for the Thoth workspace. diff --git a/thoth-oai-server/assets/oai2.xsl b/thoth-oai-server/assets/oai2.xsl new file mode 100644 index 000000000..5f29642d8 --- /dev/null +++ b/thoth-oai-server/assets/oai2.xsl @@ -0,0 +1,707 @@ + + + + + + + + + + + + + + + +td.value { + vertical-align: top; + padding-left: 1em; + padding: 3px; +} +td.key { + background-color: #e0e0ff; + padding: 3px; + text-align: right; + border: 1px solid #c0c0c0; + white-space: nowrap; + font-weight: bold; + vertical-align: top; +} +.dcdata td.key { + background-color: #ffffe0; +} +body { + margin: 1em 2em 1em 2em; +} +h1, h2, h3 { + font-family: sans-serif; + clear: left; +} +h1 { + padding-bottom: 4px; + margin-bottom: 0px; +} +h2 { + margin-bottom: 0.5em; +} +h3 { + margin-bottom: 0.3em; + font-size: medium; +} +.link { + border: 1px outset #88f; + background-color: #c0c0ff; + padding: 1px 4px 1px 4px; + font-size: 80%; + text-decoration: none; + font-weight: bold; + font-family: sans-serif; + color: black; +} +.link:hover { + color: red; +} +.link:active { + color: red; + border: 1px inset #88f; + background-color: #a0a0df; +} +.oaiRecord, .oaiRecordTitle { + background-color: #f0f0ff; + border-style: solid; + border-color: #d0d0d0; +} +h2.oaiRecordTitle { + background-color: #e0e0ff; + font-size: medium; + font-weight: bold; + padding: 10px; + border-width: 2px 2px 0px 2px; + margin: 0px; +} +.oaiRecord { + margin-bottom: 3em; + border-width: 2px; + padding: 10px; +} + +.results { + margin-bottom: 1.5em; +} +ul.quicklinks { + margin-top: 2px; + padding: 4px; + text-align: left; + border-bottom: 2px solid #ccc; + border-top: 2px solid #ccc; + clear: left; +} +ul.quicklinks li { + font-size: 80%; + display: inline; + list-stlye: none; + font-family: sans-serif; +} +p.intro { + font-size: 80%; +} + + + + + + + + + Thoth OAI 2.0 + + + +
+

Thoth OAI 2.0

+ +

You are viewing an HTML version of the XML OAI response. To see the underlying XML, use your web browser's "view source" option. More information about this XSLT is at the bottom of the page.

+
+ +
+ +

About the XSLT

+

An XSLT file has converted the OAI-PMH 2.0 responses into XHTML which looks nice in a browser which supports XSLT such as Mozilla, Firebird and Internet Explorer. The XSLT file was created by Christopher Gutteridge at the University of Southampton as part of the GNU EPrints system, and is freely redistributable under the GPL.

If you want to use the XSL file on your own OAI interface, you may, but due to the way XSLT works you must install the XSL file on the same server as the OAI script; you can't just link to this copy.

+
+ + +
+ + + + + + + + + + + + +
Datestamp of response
Request URL
+ + + +

OAI Error(s)

+

The request could not be completed due to the following error or errors.

+
+ +
+
+ +

Request was of type .

+
+ + + + + + +
+
+
+
+ + + + + + + + +
Error Code
+

+
+ + + + + + + + + + + + + + + + + + +
Repository Name
Base URL
Protocol Version
Earliest Datestamp
Deleted Record Policy
Granularity
+ + +
+ + + Admin Email + + + + + + +

Unsupported Description Type

+

The XSL currently does not support this type of description.

+
+ +
+
+ + + + + +

OAI-Identifier

+ + + + + + + + + +
Scheme
Repository Identifier
Delimiter
Sample OAI Identifier
+
+ + + + + +

EPrints Description

+ +

Content

+ +
+ +

Submission Policy

+ +
+

Metadata Policy

+ +

Data Policy

+ + +
+ + + +

+
+ +
+
+
+ + +

Comment

+
+
+ + + + + +

Friends

+
    + +
+
+ + +
  • + +Identify
  • +
    + + + + + +

    Branding

    + + +
    + + +

    Icon

    + + + {br:title} + + + {br:title} + + +
    + + +

    Metadata Rendering Rule

    + + + + + + + +
    URL
    Namespace
    Mime Type
    +
    + + + + + + +

    Gateway Information

    + + + + + + + + + + + + + + +
    Source
    Description
    URL
    Notes
    +
    + + + Admin + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Set

    + + + + +
    setName
    +
    + + + + + +
    +

    ListMetadataFormats

    + + +
    +
    +
    + + +

    Metadata Format

    + + + + + + + +
    metadataPrefix
    metadataNamespace
    schema
    +
    + + + + + + + + + +

    This is a list of metadata formats available for the record "". Use these links to view the metadata: + + + +

    +
    + +

    This is a list of metadata formats available from this archive.

    +
    +
    +
    + + + + +

    OAI Record:

    +
    + + + +
    +
    + + +

    OAI Record Header

    + + + + + + +
    OAI Identifier + + oai_dc + oai_openaire + marcxml + formats +
    Datestamp
    + +

    This record has been deleted.

    +
    +
    + + + +

    "about" part of record container not supported by the XSL

    +
    + + +   + + + + + + + + + + setSpec + + Identifiers + Records + + + + + + + + +

    There are more results.

    + + + + + + + + + + + +
    expirationDate
    completeListSize
    cursor
    resumptionToken: Resume
    +
    + + + + +

    Unknown Metadata Format

    +
    + +
    +
    + + + + +
    +

    MARC21 (marcxml)

    + + +
    +
    +
    + + + + +
    +

    OpenAIRE Metadata (oai_openaire)

    + + +
    +
    +
    + + + + +
    +

    Dublin Core Metadata (oai_dc)

    + + +
    +
    +
    + + +Title + + +Author or Creator + + +Subject and Keywords + + +Description + + +Publisher + + +Other Contributor + + +Date + + +Resource Type + + +Format + + +Resource Identifier + + +Source + + +Language + + +Relation + + + + + URL + URL not shown as it is very long. + + + + + + + + + + + + + +Coverage + + +Rights Management + + + + +
    + <></> +
    +
    + + + + + ="" + + + +.xmlSource { + font-family: monospace; + line-height: 1.1rem; + border: solid #c0c0a0 1px; + background-color: #fff; + padding: 2em 2em 2em 0em; +} +.xmlBlock { + padding-left: 2em; +} +.xmlTagName { + color: #800000; + font-weight: bold; +} +.xmlAttrName { + font-weight: bold; +} +.xmlAttrValue { + color: #0000c0; +} + + +
    + diff --git a/thoth-oai-server/src/lib.rs b/thoth-oai-server/src/lib.rs new file mode 100644 index 000000000..34962088e --- /dev/null +++ b/thoth-oai-server/src/lib.rs @@ -0,0 +1,572 @@ +mod metadata; +mod service; + +use std::{collections::HashMap, io, time::Duration}; + +use actix_cors::Cors; +use actix_web::{middleware::Logger, web, App, HttpRequest, HttpResponse, HttpServer}; +use chrono::Utc; +use quick_xml::escape::escape; +use service::{ + MetadataPrefix, OaiService, RecordPage, ResumptionToken, ADMIN_EMAIL, RECORD_PREFIX, + REPOSITORY_NAME, SAMPLE_ID, +}; +use thoth_errors::ThothError; +use uuid::Uuid; + +const LOG_FORMAT: &str = r#"%{r}a %a "%r" %s %b "%{Referer}i" "%{User-Agent}i" %T"#; +const XSL_STYLESHEET: &str = include_str!("../assets/oai2.xsl"); + +#[derive(Clone)] +struct AppState { + service: OaiService, +} + +#[derive(Debug)] +struct ProtocolError { + code: &'static str, + message: String, +} + +enum HandlerError { + Protocol(ProtocolError), + Internal(ThothError), +} + +type HandlerResult = Result; + +impl From for HandlerError { + fn from(value: ProtocolError) -> Self { + Self::Protocol(value) + } +} + +async fn index() -> HttpResponse { + HttpResponse::Found() + .append_header(("Location", "/oai")) + .finish() +} + +async fn stylesheet() -> HttpResponse { + HttpResponse::Ok() + .content_type("text/xsl; charset=utf-8") + .body(XSL_STYLESHEET) +} + +async fn oai( + request: HttpRequest, + params: web::Query>, + state: web::Data, +) -> HttpResponse { + let params = params.into_inner(); + match handle_oai_request(&request, ¶ms, &state.service).await { + Ok(body) => xml_response(success_document(&state.service, ¶ms, &body)), + Err(HandlerError::Protocol(error)) => xml_response(error_document( + &state.service, + ¶ms, + error.code, + &error.message, + )), + Err(HandlerError::Internal(error)) => { + log::error!("OAI request failed: {error}"); + HttpResponse::InternalServerError() + .content_type("text/plain; charset=utf-8") + .body("Internal 
Server Error") + } + } +} + +async fn not_found() -> HttpResponse { + HttpResponse::NotFound() + .content_type("text/html; charset=utf-8") + .body( + r#" + + + 404 - Page Not Found + + + +

    404 - Page Not Found

    +

    The requested page was not found.

    +

    OAI-PMH Interface

    + +"#, + ) +} + +async fn handle_oai_request( + _request: &HttpRequest, + params: &HashMap, + service: &OaiService, +) -> HandlerResult { + let verb = params + .get("verb") + .map(String::as_str) + .ok_or_else(|| bad_verb("Missing verb parameter"))?; + + match verb { + "Identify" => { + require_only(params, &["verb"])?; + let earliest = service.earliest().await.map_err(HandlerError::Internal)?; + let latest = service.latest().await.map_err(HandlerError::Internal)?; + Ok(render_identify(service, earliest, latest)) + } + "ListMetadataFormats" => { + require_only(params, &["verb", "identifier"])?; + if let Some(identifier) = params.get("identifier") { + let work_id = parse_identifier(identifier)?; + service + .get_record(work_id, MetadataPrefix::OaiDc) + .await + .map_err(map_get_record_error(MetadataPrefix::OaiDc))?; + } + Ok(render_list_metadata_formats()) + } + "ListSets" => { + require_only(params, &["verb"])?; + let sets = service.list_sets().await.map_err(HandlerError::Internal)?; + Ok(render_list_sets(&sets)) + } + "GetRecord" => { + require_only(params, &["verb", "identifier", "metadataPrefix"])?; + let identifier = params + .get("identifier") + .ok_or_else(|| bad_argument("Missing identifier parameter"))?; + let metadata_prefix = params + .get("metadataPrefix") + .ok_or_else(|| bad_argument("Missing metadataPrefix parameter"))?; + let work_id = parse_identifier(identifier)?; + let metadata_prefix = parse_metadata_prefix(metadata_prefix)?; + let work = service + .get_record(work_id, metadata_prefix) + .await + .map_err(map_get_record_error(metadata_prefix))?; + Ok(render_get_record(service, &work, metadata_prefix).await?) 
+ } + "ListIdentifiers" => { + validate_list_verb(params)?; + let token = parse_list_token(params, true)?; + let page = service + .list_records(token.metadata_prefix, token.set.clone(), token.offset, true) + .await + .map_err(map_list_error)?; + if page.records.is_empty() { + return Err(HandlerError::Protocol(no_records_match())); + } + Ok(render_list_identifiers(&page)) + } + "ListRecords" => { + validate_list_verb(params)?; + let token = parse_list_token(params, false)?; + let page = service + .list_records( + token.metadata_prefix, + token.set.clone(), + token.offset, + false, + ) + .await + .map_err(map_list_error)?; + if page.records.is_empty() { + return Err(HandlerError::Protocol(no_records_match())); + } + Ok(render_list_records(service, &page, token.metadata_prefix).await?) + } + other => Err(HandlerError::Protocol(bad_verb(&format!( + "Unknown verb {other}" + )))), + } +} + +fn render_identify( + service: &OaiService, + earliest: thoth_api::model::Timestamp, + latest: thoth_api::model::Timestamp, +) -> String { + format!( + "\ +{}\ +{}\ +2.0\ +{}\ +{}\ +no\ +YYYY-MM-DDThh:mm:ssZ\ +\ +\ +oai\ +thoth.pub\ +:\ +{}:{}\ +\ +\ +\ +\ +{}\ +\ +\ +", + xml_escape(REPOSITORY_NAME), + xml_escape(&service.repository_url()), + xml_escape(ADMIN_EMAIL), + xml_escape(&OaiService::timestamp_xml(earliest)), + RECORD_PREFIX, + SAMPLE_ID, + xml_escape(&OaiService::timestamp_xml(latest)), + ) +} + +fn render_list_metadata_formats() -> String { + let prefixes = [ + MetadataPrefix::OaiDc, + MetadataPrefix::OaiOpenaire, + MetadataPrefix::MarcXml, + ]; + let mut xml = String::from(""); + for prefix in prefixes { + xml.push_str(""); + push_text_element(&mut xml, "metadataPrefix", prefix.as_str()); + push_text_element(&mut xml, "schema", prefix.schema()); + push_text_element(&mut xml, "metadataNamespace", prefix.namespace()); + xml.push_str(""); + } + xml.push_str(""); + xml +} + +fn render_list_sets(sets: &[service::SetRecord]) -> String { + let mut xml = String::from(""); + for 
set in sets { + xml.push_str(""); + push_text_element(&mut xml, "setSpec", &set.spec); + push_text_element(&mut xml, "setName", &set.name); + xml.push_str(""); + } + xml.push_str(""); + xml +} + +async fn render_get_record( + service: &OaiService, + work: &thoth_client::Work, + metadata_prefix: MetadataPrefix, +) -> HandlerResult { + let mut xml = String::from(""); + xml.push_str(&render_record_xml(service, work, metadata_prefix).await?); + xml.push_str(""); + Ok(xml) +} + +fn render_list_identifiers(page: &RecordPage) -> String { + let mut xml = String::from(""); + for work in &page.records { + xml.push_str(&render_header_xml(work)); + } + if let Some(token) = &page.next_token { + xml.push_str(&render_resumption_token( + token, + page.cursor, + page.complete_list_size, + )); + } + xml.push_str(""); + xml +} + +async fn render_list_records( + service: &OaiService, + page: &RecordPage, + metadata_prefix: MetadataPrefix, +) -> HandlerResult { + let mut xml = String::from(""); + for work in &page.records { + xml.push_str(&render_record_xml(service, work, metadata_prefix).await?); + } + if let Some(token) = &page.next_token { + xml.push_str(&render_resumption_token( + token, + page.cursor, + page.complete_list_size, + )); + } + xml.push_str(""); + Ok(xml) +} + +async fn render_record_xml( + service: &OaiService, + work: &thoth_client::Work, + metadata_prefix: MetadataPrefix, +) -> HandlerResult { + let metadata = match metadata_prefix { + MetadataPrefix::OaiDc => metadata::map_oai_dc(work).map_err(HandlerError::Internal)?, + MetadataPrefix::OaiOpenaire => { + metadata::map_oai_openaire(work).map_err(HandlerError::Internal)? 
+ } + MetadataPrefix::MarcXml => service + .get_marcxml_record(work.work_id) + .await + .map_err(map_get_record_error(metadata_prefix))?, + }; + + Ok(format!( + "{}{}", + render_header_xml(work), + metadata + )) +} + +fn render_header_xml(work: &thoth_client::Work) -> String { + let set_spec = OaiService::set_spec(&work.imprint.publisher.publisher_name); + format!( + "
    \ +{}\ +{}\ +{}\ +
    ", + xml_escape(&OaiService::oai_identifier(work.work_id)), + xml_escape(&OaiService::timestamp_xml(work.updated_at_with_relations)), + xml_escape(&set_spec), + ) +} + +fn render_resumption_token(token: &str, cursor: i64, complete_list_size: i64) -> String { + format!( + "{}", + cursor, + complete_list_size, + xml_escape(token) + ) +} + +fn validate_list_verb(params: &HashMap) -> HandlerResult<()> { + require_only( + params, + &[ + "verb", + "metadataPrefix", + "set", + "resumptionToken", + "from", + "until", + ], + ) +} + +fn parse_list_token( + params: &HashMap, + identifiers_only: bool, +) -> HandlerResult { + if let Some(value) = params.get("resumptionToken") { + if params.len() != 2 { + return Err( + bad_argument("resumptionToken cannot be combined with other arguments").into(), + ); + } + let token = OaiService::decode_resumption_token(value).map_err(|_| ProtocolError { + code: "badResumptionToken", + message: "Invalid resumptionToken".to_string(), + })?; + if token.identifiers_only != identifiers_only { + return Err(ProtocolError { + code: "badResumptionToken", + message: "resumptionToken does not match the request verb".to_string(), + } + .into()); + } + return Ok(token); + } + + let metadata_prefix = params + .get("metadataPrefix") + .ok_or_else(|| bad_argument("Missing metadataPrefix parameter"))?; + Ok(ResumptionToken { + offset: 0, + metadata_prefix: parse_metadata_prefix(metadata_prefix)?, + set: params.get("set").cloned(), + identifiers_only, + }) +} + +fn parse_metadata_prefix(value: &str) -> HandlerResult { + MetadataPrefix::try_from(value).map_err(|_| { + ProtocolError { + code: "cannotDisseminateFormat", + message: format!("Unsupported metadataPrefix {value}"), + } + .into() + }) +} + +fn parse_identifier(value: &str) -> HandlerResult { + OaiService::parse_oai_identifier(value).map_err(|_| bad_argument("Invalid identifier").into()) +} + +fn map_get_record_error( + metadata_prefix: MetadataPrefix, +) -> impl Fn(ThothError) -> HandlerError + 
Copy { + move |error| match error { + ThothError::EntityNotFound => HandlerError::Protocol(ProtocolError { + code: "idDoesNotExist", + message: "The requested identifier does not exist".to_string(), + }), + ThothError::IncompleteMetadataRecord(_, _) + | ThothError::InvalidMetadataSpecification(_) => HandlerError::Protocol(ProtocolError { + code: "cannotDisseminateFormat", + message: format!( + "Record cannot be disseminated as {}", + metadata_prefix.as_str() + ), + }), + other => HandlerError::Internal(other), + } +} + +fn map_list_error(error: ThothError) -> HandlerError { + match error { + ThothError::EntityNotFound => no_records_match().into(), + other => HandlerError::Internal(other), + } +} + +fn require_only(params: &HashMap, allowed: &[&str]) -> HandlerResult<()> { + if params.keys().all(|key| allowed.contains(&key.as_str())) { + Ok(()) + } else { + Err(bad_argument("The request included unsupported arguments").into()) + } +} + +fn bad_argument(message: &str) -> ProtocolError { + ProtocolError { + code: "badArgument", + message: message.to_string(), + } +} + +fn bad_verb(message: &str) -> ProtocolError { + ProtocolError { + code: "badVerb", + message: message.to_string(), + } +} + +fn no_records_match() -> ProtocolError { + ProtocolError { + code: "noRecordsMatch", + message: "The request matched no records".to_string(), + } +} + +fn success_document(service: &OaiService, params: &HashMap, body: &str) -> String { + format!( + "{}{}{}{}{}", + xml_declaration(), + stylesheet_pi(), + response_date(), + request_element(service, params), + body + ) +} + +fn error_document( + service: &OaiService, + params: &HashMap, + code: &str, + message: &str, +) -> String { + format!( + "{}{}{}{}{}", + xml_declaration(), + stylesheet_pi(), + response_date(), + request_element(service, params), + xml_escape(code), + xml_escape(message) + ) +} + +fn request_element(service: &OaiService, params: &HashMap) -> String { + let mut attrs = params.iter().collect::>(); + 
attrs.sort_by(|(left, _), (right, _)| left.cmp(right)); + let mut element = String::from("'); + element.push_str(&xml_escape(&service.repository_url())); + element.push_str(""); + element +} + +fn response_date() -> String { + Utc::now().format("%Y-%m-%dT%H:%M:%SZ").to_string() +} + +fn xml_declaration() -> &'static str { + "" +} + +fn stylesheet_pi() -> &'static str { + "\n\n" +} + +fn xml_escape(value: &str) -> String { + escape(value).into_owned() +} + +fn push_text_element(xml: &mut String, name: &str, text: &str) { + xml.push('<'); + xml.push_str(name); + xml.push('>'); + xml.push_str(&xml_escape(text)); + xml.push_str("'); +} + +fn xml_response(body: String) -> HttpResponse { + HttpResponse::Ok() + .content_type("application/xml; charset=utf-8") + .body(body) +} + +#[actix_web::main] +pub async fn start_server( + host: String, + port: String, + threads: usize, + keep_alive: u64, + public_url: String, + gql_endpoint: String, + export_url: String, +) -> io::Result<()> { + env_logger::init_from_env(env_logger::Env::new().default_filter_or("info")); + let state = AppState { + service: OaiService::new(public_url, gql_endpoint, export_url), + }; + + HttpServer::new(move || { + App::new() + .wrap(Logger::new(LOG_FORMAT)) + .wrap(Cors::default().allowed_methods(vec!["GET", "OPTIONS"])) + .app_data(web::Data::new(state.clone())) + .service(web::resource("/").route(web::get().to(index))) + .service(web::resource("/oai").route(web::get().to(oai))) + .service(web::resource("/oai2.xsl").route(web::get().to(stylesheet))) + .default_service(web::route().to(not_found)) + }) + .workers(threads) + .keep_alive(Duration::from_secs(keep_alive)) + .bind(format!("{host}:{port}"))? 
+ .run() + .await +} diff --git a/thoth-oai-server/src/metadata.rs b/thoth-oai-server/src/metadata.rs new file mode 100644 index 000000000..b96ee42cc --- /dev/null +++ b/thoth-oai-server/src/metadata.rs @@ -0,0 +1,731 @@ +use quick_xml::escape::escape; +use thoth_api::markup::{convert_from_jats, ConversionLimit, MarkupFormat}; +use thoth_client::{ + AbstractType, ContributionType, LanguageRelation, PublicationType, RelationType, SubjectType, + Work, WorkAbstracts, WorkContributions, WorkLanguages, WorkTitles, +}; +use thoth_errors::ThothResult; + +fn xml_escape(value: &str) -> String { + escape(value).into_owned() +} + +fn push_text_element(xml: &mut String, name: &str, text: &str) { + xml.push('<'); + xml.push_str(name); + xml.push('>'); + xml.push_str(&xml_escape(text)); + xml.push_str("'); +} + +fn push_text_element_attrs(xml: &mut String, name: &str, attrs: &[(&str, String)], text: &str) { + xml.push('<'); + xml.push_str(name); + for (key, value) in attrs { + xml.push(' '); + xml.push_str(key); + xml.push_str("=\""); + xml.push_str(&xml_escape(value)); + xml.push('"'); + } + xml.push('>'); + xml.push_str(&xml_escape(text)); + xml.push_str("'); +} + +fn push_open_tag(xml: &mut String, name: &str, attrs: &[(&str, String)]) { + xml.push('<'); + xml.push_str(name); + for (key, value) in attrs { + xml.push(' '); + xml.push_str(key); + xml.push_str("=\""); + xml.push_str(&xml_escape(value)); + xml.push('"'); + } + xml.push('>'); +} + +fn push_close_tag(xml: &mut String, name: &str) { + xml.push_str("'); +} + +fn doi_url(doi: &thoth_api::model::Doi) -> String { + format!("https://doi.org/{doi}") +} + +fn orcid_url(orcid: &thoth_api::model::Orcid) -> String { + format!("https://orcid.org/{orcid}") +} + +fn ror_url(ror: &thoth_api::model::Ror) -> String { + format!("https://ror.org/{ror}") +} + +fn work_url(work: &Work) -> String { + format!("https://thoth.pub/books/{}", work.work_id) +} + +fn canonical_title(work: &Work) -> Option<&WorkTitles> { + work.titles + .iter() 
+ .find(|title| title.canonical) + .or_else(|| work.titles.first()) +} + +fn canonical_long_abstract(work: &Work) -> Option<&WorkAbstracts> { + work.abstracts + .iter() + .find(|abstract_record| { + abstract_record.abstract_type == AbstractType::LONG && abstract_record.canonical + }) + .or_else(|| { + work.abstracts + .iter() + .find(|abstract_record| abstract_record.abstract_type == AbstractType::LONG) + }) +} + +fn canonical_short_abstract(work: &Work) -> Option<&WorkAbstracts> { + work.abstracts + .iter() + .find(|abstract_record| { + abstract_record.abstract_type == AbstractType::SHORT && abstract_record.canonical + }) + .or_else(|| { + work.abstracts + .iter() + .find(|abstract_record| abstract_record.abstract_type == AbstractType::SHORT) + }) +} + +fn abstract_text(abstract_record: Option<&WorkAbstracts>) -> ThothResult> { + abstract_record + .map(|abstract_record| { + convert_from_jats( + &abstract_record.content, + MarkupFormat::PlainText, + ConversionLimit::Abstract, + ) + }) + .transpose() +} + +fn creators(work: &Work) -> impl Iterator { + work.contributions + .iter() + .filter(|contribution| contribution.contribution_type == ContributionType::AUTHOR) +} + +fn contributors(work: &Work) -> impl Iterator { + work.contributions + .iter() + .filter(|contribution| contribution.contribution_type != ContributionType::AUTHOR) +} + +fn main_language(work: &Work) -> Option<&WorkLanguages> { + match work.languages.as_slice() { + [] => None, + [language] => Some(language), + _ => work + .languages + .iter() + .min_by_key(|language| match language.language_relation { + LanguageRelation::TRANSLATED_INTO => 0, + LanguageRelation::ORIGINAL => 1, + LanguageRelation::TRANSLATED_FROM => 2, + _ => 3, + }), + } +} + +fn personal_name(contribution: &WorkContributions) -> String { + match contribution.first_name.as_deref() { + Some(first_name) if !first_name.is_empty() && !contribution.last_name.is_empty() => { + format!("{}, {}", contribution.last_name, first_name) + } + _ if 
!contribution.full_name.is_empty() => contribution.full_name.clone(), + _ => contribution.last_name.clone(), + } +} + +fn publication_type_value(publication_type: &PublicationType) -> &'static str { + match publication_type { + PublicationType::HARDBACK => "hardback", + PublicationType::PAPERBACK => "paperback", + PublicationType::PDF => "application/pdf", + PublicationType::EPUB => "application/epub+zip", + PublicationType::XML => "application/xml", + PublicationType::HTML => "text/html", + PublicationType::DOCX => { + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + } + PublicationType::MP3 => "audio/mpeg", + PublicationType::WAV => "audio/wav", + PublicationType::MOBI => "application/x-mobipocket-ebook", + PublicationType::AZW3 => "application/vnd.amazon.ebook", + PublicationType::FICTION_BOOK => "application/x-fictionbook+xml", + PublicationType::Other(_) => "application/octet-stream", + } +} + +fn dc_type(work: &Work) -> &'static str { + match work.work_type { + thoth_client::WorkType::JOURNAL_ISSUE => "issue", + thoth_client::WorkType::BOOK_CHAPTER => "chapter", + thoth_client::WorkType::Other(_) => "book", + _ => "book", + } +} + +fn openaire_resource_type(work: &Work) -> Option<(&'static str, &'static str)> { + match work.work_type { + thoth_client::WorkType::JOURNAL_ISSUE => { + Some(("http://purl.org/coar/resource_type/c_0640", "journal")) + } + thoth_client::WorkType::BOOK_CHAPTER => { + Some(("http://purl.org/coar/resource_type/c_3248", "book part")) + } + thoth_client::WorkType::MONOGRAPH + | thoth_client::WorkType::TEXTBOOK + | thoth_client::WorkType::EDITED_BOOK + | thoth_client::WorkType::BOOK_SET => { + Some(("http://purl.org/coar/resource_type/c_2f33", "book")) + } + thoth_client::WorkType::Other(_) => None, + } +} + +fn normalized_license_name(license: &str) -> &str { + match license.trim_end_matches('/') { + "http://creativecommons.org/publicdomain/zero/1.0" => "CC0 1.0 Universal", + 
"http://creativecommons.org/licenses/by/4.0" => "CC BY 4.0", + "http://creativecommons.org/licenses/by-sa/4.0" => "CC BY-SA 4.0", + "http://creativecommons.org/licenses/by-nc/4.0" => "CC BY-NC 4.0", + "http://creativecommons.org/licenses/by-nc-sa/4.0" => "CC BY-NC-SA 4.0", + "http://creativecommons.org/licenses/by-nd/4.0" => "CC BY-ND 4.0", + "http://creativecommons.org/licenses/by-nc-nd/4.0" => "CC BY-NC-ND 4.0", + "http://creativecommons.org/licenses/by/3.0" => "CC BY 3.0", + "http://creativecommons.org/licenses/by-sa/3.0" => "CC BY-SA 3.0", + "http://creativecommons.org/licenses/by-nc/3.0" => "CC BY-NC 3.0", + "http://creativecommons.org/licenses/by-nc-sa/3.0" => "CC BY-NC-SA 3.0", + "http://creativecommons.org/licenses/by-nd/3.0" => "CC BY-ND 3.0", + "http://creativecommons.org/licenses/by-nc-nd/3.0" => "CC BY-NC-ND 3.0", + _ => license, + } +} + +fn parent_work(work: &Work) -> Option<&thoth_client::WorkRelationsRelatedWork> { + work.relations + .iter() + .find(|relation| relation.relation_type == RelationType::IS_CHILD_OF) + .map(|relation| &relation.related_work) +} + +pub(crate) fn map_oai_dc(work: &Work) -> ThothResult { + let mut xml = String::from( + r#""#, + ); + + if let Some(title) = canonical_title(work) { + push_text_element(&mut xml, "dc:title", &title.full_title); + } + + for creator in creators(work) { + push_text_element(&mut xml, "dc:creator", &creator.full_name); + } + + for subject in work + .subjects + .iter() + .filter(|subject| subject.subject_type == SubjectType::KEYWORD) + { + push_text_element(&mut xml, "dc:subject", &subject.subject_code); + } + + if let Some(description) = abstract_text(canonical_long_abstract(work))? 
{ + push_text_element(&mut xml, "dc:description", &description); + } + + push_text_element( + &mut xml, + "dc:publisher", + &work.imprint.publisher.publisher_name, + ); + + for contributor in contributors(work) { + push_text_element(&mut xml, "dc:contributor", &contributor.full_name); + } + + if let Some(publication_date) = &work.publication_date { + push_text_element(&mut xml, "dc:date", &publication_date.to_string()); + } + + push_text_element(&mut xml, "dc:type", dc_type(work)); + + for publication in &work.publications { + push_text_element( + &mut xml, + "dc:format", + publication_type_value(&publication.publication_type), + ); + } + + push_text_element(&mut xml, "dc:identifier", &work_url(work)); + if let Some(doi) = &work.doi { + push_text_element(&mut xml, "dc:identifier", &doi_url(doi)); + } + for publication in &work.publications { + if let Some(isbn) = &publication.isbn { + push_text_element(&mut xml, "dc:identifier", &format!("urn:isbn:{isbn}")); + } + } + + if let Some(language) = main_language(work) { + push_text_element( + &mut xml, + "dc:language", + &language.language_code.to_string().to_lowercase(), + ); + } + + for relation in &work.relations { + if let Some(doi) = &relation.related_work.doi { + push_text_element(&mut xml, "dc:relation", &doi_url(doi)); + } + for publication in &relation.related_work.publications { + if let Some(isbn) = &publication.isbn { + push_text_element(&mut xml, "dc:relation", &format!("urn:isbn:{isbn}")); + } + } + } + + if let Some(license) = &work.license { + push_text_element(&mut xml, "dc:rights", license); + } + + xml.push_str(""); + Ok(xml) +} + +pub(crate) fn map_oai_openaire(work: &Work) -> ThothResult { + let mut xml = String::from( + r#""#, + ); + + push_text_element_attrs( + &mut xml, + "datacite:identifier", + &[("identifierType", "URL".to_string())], + &work_url(work), + ); + + push_open_tag(&mut xml, "datacite:titles", &[]); + if let Some(title) = canonical_title(work) { + push_text_element(&mut xml, 
"datacite:title", &title.title); + if let Some(subtitle) = &title.subtitle { + push_text_element_attrs( + &mut xml, + "datacite:title", + &[("titleType", "Subtitle".to_string())], + subtitle, + ); + } + } + push_close_tag(&mut xml, "datacite:titles"); + + push_open_tag(&mut xml, "datacite:creators", &[]); + for creator in creators(work) { + push_open_tag(&mut xml, "datacite:creator", &[]); + push_text_element_attrs( + &mut xml, + "datacite:creatorName", + &[("nameType", "Personal".to_string())], + &personal_name(creator), + ); + if let Some(first_name) = creator.first_name.as_deref() { + if !first_name.is_empty() { + push_text_element(&mut xml, "datacite:givenName", first_name); + } + } + if !creator.last_name.is_empty() { + push_text_element(&mut xml, "datacite:familyName", &creator.last_name); + } + if let Some(orcid) = &creator.contributor.orcid { + push_text_element_attrs( + &mut xml, + "datacite:nameIdentifier", + &[ + ("nameIdentifierScheme", "ORCID".to_string()), + ("schemeURI", "https://orcid.org/".to_string()), + ], + &orcid_url(orcid), + ); + } + for affiliation in &creator.affiliations { + if let Some(ror) = &affiliation.institution.ror { + push_text_element_attrs( + &mut xml, + "datacite:affiliation", + &[("affiliationIdentifier", ror_url(ror))], + &affiliation.institution.institution_name, + ); + } else { + push_text_element( + &mut xml, + "datacite:affiliation", + &affiliation.institution.institution_name, + ); + } + } + push_close_tag(&mut xml, "datacite:creator"); + } + push_close_tag(&mut xml, "datacite:creators"); + + push_open_tag(&mut xml, "datacite:contributors", &[]); + for contributor in contributors(work) { + let contributor_type = if contributor.contribution_type == ContributionType::EDITOR { + "Editor" + } else { + "Other" + }; + push_open_tag( + &mut xml, + "datacite:contributor", + &[("contributorType", contributor_type.to_string())], + ); + push_text_element_attrs( + &mut xml, + "datacite:creatorName", + &[("nameType", 
"Personal".to_string())], + &personal_name(contributor), + ); + if let Some(first_name) = contributor.first_name.as_deref() { + if !first_name.is_empty() { + push_text_element(&mut xml, "datacite:givenName", first_name); + } + } + if !contributor.last_name.is_empty() { + push_text_element(&mut xml, "datacite:familyName", &contributor.last_name); + } + if let Some(orcid) = &contributor.contributor.orcid { + push_text_element_attrs( + &mut xml, + "datacite:nameIdentifier", + &[ + ("nameIdentifierScheme", "ORCID".to_string()), + ("schemeURI", "https://orcid.org/".to_string()), + ], + &orcid_url(orcid), + ); + } + for affiliation in &contributor.affiliations { + if let Some(ror) = &affiliation.institution.ror { + push_text_element_attrs( + &mut xml, + "datacite:affiliation", + &[("affiliationIdentifier", ror_url(ror))], + &affiliation.institution.institution_name, + ); + } else { + push_text_element( + &mut xml, + "datacite:affiliation", + &affiliation.institution.institution_name, + ); + } + } + push_close_tag(&mut xml, "datacite:contributor"); + } + push_close_tag(&mut xml, "datacite:contributors"); + + push_open_tag(&mut xml, "oaire:fundingReferences", &[]); + for funding in &work.fundings { + push_open_tag(&mut xml, "oaire:fundingReference", &[]); + push_text_element( + &mut xml, + "oaire:funderName", + &funding.institution.institution_name, + ); + if let Some(ror) = &funding.institution.ror { + push_text_element_attrs( + &mut xml, + "oaire:funderIdentifier", + &[("funderIdentifierType", "ROR".to_string())], + &ror_url(ror), + ); + } + if let Some(grant_number) = &funding.grant_number { + push_text_element(&mut xml, "oaire:awardNumber", grant_number); + } + if let Some(project_name) = &funding.project_name { + push_text_element(&mut xml, "oaire:awardTitle", project_name); + } + push_close_tag(&mut xml, "oaire:fundingReference"); + } + push_close_tag(&mut xml, "oaire:fundingReferences"); + + push_open_tag(&mut xml, "datacite:alternateIdentifiers", &[]); + if let 
Some(doi) = &work.doi { + push_text_element_attrs( + &mut xml, + "datacite:alternateIdentifier", + &[("alternateIdentifierType", "DOI".to_string())], + &doi_url(doi), + ); + } + if let Some(landing_page) = &work.landing_page { + push_text_element_attrs( + &mut xml, + "datacite:alternateIdentifier", + &[("alternateIdentifierType", "URL".to_string())], + landing_page, + ); + } + for publication in &work.publications { + if let Some(isbn) = &publication.isbn { + push_text_element_attrs( + &mut xml, + "datacite:alternateIdentifier", + &[("alternateIdentifierType", "ISBN".to_string())], + &isbn.to_string(), + ); + } + } + push_close_tag(&mut xml, "datacite:alternateIdentifiers"); + + push_open_tag(&mut xml, "datacite:relatedIdentifiers", &[]); + for relation in &work.relations { + let relation_type = if matches!( + relation.relation_type, + RelationType::HAS_CHILD | RelationType::HAS_PART + ) { + "HasPart" + } else { + "IsPartOf" + }; + if let Some(doi) = &relation.related_work.doi { + push_text_element_attrs( + &mut xml, + "datacite:relatedIdentifier", + &[ + ("relatedIdentifierType", "DOI".to_string()), + ("relationType", relation_type.to_string()), + ], + &doi_url(doi), + ); + } + if let Some(landing_page) = &relation.related_work.landing_page { + push_text_element_attrs( + &mut xml, + "datacite:relatedIdentifier", + &[ + ("relatedIdentifierType", "URL".to_string()), + ("relationType", relation_type.to_string()), + ], + landing_page, + ); + } + for publication in &relation.related_work.publications { + if let Some(isbn) = &publication.isbn { + push_text_element_attrs( + &mut xml, + "datacite:relatedIdentifier", + &[ + ("relatedIdentifierType", "ISBN".to_string()), + ("relationType", relation_type.to_string()), + ], + &isbn.to_string(), + ); + } + } + } + push_close_tag(&mut xml, "datacite:relatedIdentifiers"); + + for language in &work.languages { + push_text_element( + &mut xml, + "dc:language", + &language.language_code.to_string().to_lowercase(), + ); + } + + 
push_text_element( + &mut xml, + "dc:publisher", + &work.imprint.publisher.publisher_name, + ); + + if let Some(publication_date) = &work.publication_date { + push_text_element_attrs( + &mut xml, + "datacite:date", + &[("dateType", "Issued".to_string())], + &publication_date.to_string(), + ); + } + + if let Some((uri, value)) = openaire_resource_type(work) { + push_text_element_attrs( + &mut xml, + "oaire:resourceType", + &[ + ("resourceTypeGeneral", "literature".to_string()), + ("uri", uri.to_string()), + ], + value, + ); + } + + if let Some(short_abstract) = abstract_text(canonical_short_abstract(work))? { + push_text_element(&mut xml, "dc:description", &short_abstract); + } + if let Some(long_abstract) = abstract_text(canonical_long_abstract(work))? { + push_text_element(&mut xml, "dc:description", &long_abstract); + } + if let Some(toc) = &work.toc { + push_text_element(&mut xml, "dc:description", toc); + } + + for publication in &work.publications { + push_text_element( + &mut xml, + "dc:format", + publication_type_value(&publication.publication_type), + ); + } + + if let Some(license) = &work.license { + push_text_element_attrs( + &mut xml, + "datacite:rights", + &[( + "rightsURI", + "http://purl.org/coar/access_right/c_abf2".to_string(), + )], + "open access", + ); + push_text_element_attrs( + &mut xml, + "oaire:licenseCondition", + &[("uri", license.clone())], + normalized_license_name(license), + ); + } else { + push_text_element_attrs( + &mut xml, + "datacite:rights", + &[( + "rightsURI", + "http://purl.org/coar/access_right/c_16ec".to_string(), + )], + "restricted access", + ); + } + + for subject in &work.subjects { + match subject.subject_type { + SubjectType::KEYWORD | SubjectType::CUSTOM => { + push_text_element(&mut xml, "datacite:subject", &subject.subject_code); + } + SubjectType::THEMA => { + push_text_element_attrs( + &mut xml, + "datacite:subject", + &[("subjectScheme", "Thema".to_string())], + &subject.subject_code, + ); + } + _ => { + 
push_text_element_attrs( + &mut xml, + "datacite:subject", + &[("subjectScheme", subject.subject_type.to_string())], + &subject.subject_code, + ); + } + } + } + + let mut sizes = Vec::new(); + if let Some(page_count) = work.page_count { + sizes.push(format!("{page_count} pages")); + } + if let Some(image_count) = work.image_count { + sizes.push(format!("{image_count} images")); + } + if let Some(table_count) = work.table_count { + sizes.push(format!("{table_count} tables")); + } + if let Some(audio_count) = work.audio_count { + sizes.push(format!("{audio_count} audios")); + } + if let Some(video_count) = work.video_count { + sizes.push(format!("{video_count} videos")); + } + if !sizes.is_empty() { + push_open_tag(&mut xml, "datacite:sizes", &[]); + for size in sizes { + push_text_element(&mut xml, "datacite:size", &size); + } + push_close_tag(&mut xml, "datacite:sizes"); + } + + for publication in &work.publications { + for location in &publication.locations { + if let Some(full_text_url) = &location.full_text_url { + push_text_element_attrs( + &mut xml, + "oaire:file", + &[ + ( + "mimeType", + publication_type_value(&publication.publication_type).to_string(), + ), + ("objectType", "fulltext".to_string()), + ], + full_text_url, + ); + } + } + } + + if work.work_type == thoth_client::WorkType::BOOK_CHAPTER { + if let Some(parent_work) = parent_work(work) { + if let Some(parent_title) = parent_work + .titles + .iter() + .find(|title| title.canonical) + .or_else(|| parent_work.titles.first()) + { + push_text_element(&mut xml, "oaire:citationTitle", &parent_title.full_title); + } + if let Some(edition) = parent_work.edition { + push_text_element(&mut xml, "oaire:citationEdition", &edition.to_string()); + } + } + } else if let Some(issue) = work.issues.first() { + push_text_element(&mut xml, "oaire:citationTitle", &issue.series.series_name); + push_text_element( + &mut xml, + "oaire:citationIssue", + &issue.issue_ordinal.to_string(), + ); + } + + if let Some(first_page) 
= &work.first_page { + push_text_element(&mut xml, "oaire:citationStartPage", first_page); + } + if let Some(last_page) = &work.last_page { + push_text_element(&mut xml, "oaire:citationEndPage", last_page); + } + + xml.push_str(""); + Ok(xml) +} diff --git a/thoth-oai-server/src/service.rs b/thoth-oai-server/src/service.rs new file mode 100644 index 000000000..6605c7046 --- /dev/null +++ b/thoth-oai-server/src/service.rs @@ -0,0 +1,527 @@ +use std::sync::Arc; + +use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine as _}; +use quick_xml::{events::Event, Reader, Writer}; +use reqwest::Client; +use serde::{Deserialize, Serialize}; +use thoth_api::model::Timestamp; +use thoth_client::{Publisher, QueryParameters, ThothClient, Work}; +use thoth_errors::{ThothError, ThothResult}; +use uuid::Uuid; + +pub(crate) const RECORD_PREFIX: &str = "oai:thoth.pub"; +pub(crate) const REPOSITORY_NAME: &str = "Thoth OAI-PMH Repository"; +pub(crate) const ADMIN_EMAIL: &str = "support@thoth.pub"; +pub(crate) const SAMPLE_ID: &str = "5a08ff03-7d53-42a9-bfb5-7fc81c099c52"; +pub(crate) const PAGE_LIMIT: i64 = 50; + +#[derive(Clone)] +pub(crate) struct OaiService { + public_url: String, + export_url: String, + thoth_client: Arc, + export_client: Client, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub(crate) enum MetadataPrefix { + OaiDc, + OaiOpenaire, + MarcXml, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub(crate) struct ResumptionToken { + pub offset: i64, + pub metadata_prefix: MetadataPrefix, + pub set: Option, + pub identifiers_only: bool, +} + +#[derive(Debug, Clone)] +pub(crate) struct SetRecord { + pub publisher_id: Uuid, + pub spec: String, + pub name: String, +} + +#[derive(Debug, Clone)] +pub(crate) struct RecordPage { + pub records: Vec, + pub cursor: i64, + pub complete_list_size: i64, + pub next_token: Option, +} + +impl MetadataPrefix { + pub fn as_str(self) -> &'static str { + match self { + 
MetadataPrefix::OaiDc => "oai_dc", + MetadataPrefix::OaiOpenaire => "oai_openaire", + MetadataPrefix::MarcXml => "marcxml", + } + } + + pub fn schema(self) -> &'static str { + match self { + MetadataPrefix::OaiDc => "http://www.openarchives.org/OAI/2.0/oai_dc.xsd", + MetadataPrefix::OaiOpenaire => { + "https://www.openaire.eu/schema/repo-lit/4.0/openaire.xsd" + } + MetadataPrefix::MarcXml => { + "https://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd" + } + } + } + + pub fn namespace(self) -> &'static str { + match self { + MetadataPrefix::OaiDc => "http://www.openarchives.org/OAI/2.0/oai_dc/", + MetadataPrefix::OaiOpenaire => "http://namespace.openaire.eu/schema/oaire/", + MetadataPrefix::MarcXml => "https://www.loc.gov/standards/marcxml/", + } + } +} + +impl TryFrom<&str> for MetadataPrefix { + type Error = ThothError; + + fn try_from(value: &str) -> ThothResult { + match value { + "oai_dc" => Ok(Self::OaiDc), + "oai_openaire" => Ok(Self::OaiOpenaire), + "marcxml" => Ok(Self::MarcXml), + other => Err(ThothError::InvalidMetadataSpecification(other.to_string())), + } + } +} + +impl OaiService { + pub(crate) fn new(public_url: String, gql_endpoint: String, export_url: String) -> Self { + Self { + public_url, + export_url, + thoth_client: Arc::new(ThothClient::new(gql_endpoint)), + export_client: Client::new(), + } + } + + pub(crate) fn repository_url(&self) -> String { + format!("{}/oai", self.public_url.trim_end_matches('/')) + } + + pub(crate) async fn earliest(&self) -> ThothResult { + self.thoth_client.get_oai_earliest_works_updated().await + } + + pub(crate) async fn latest(&self) -> ThothResult { + self.thoth_client.get_oai_latest_works_updated().await + } + + pub(crate) async fn list_sets(&self) -> ThothResult> { + let publishers = self.thoth_client.get_publishers().await?; + Ok(publishers.into_iter().map(Self::to_set_record).collect()) + } + + pub(crate) async fn get_record( + &self, + identifier: Uuid, + metadata_prefix: MetadataPrefix, + ) -> 
ThothResult { + let work = self + .thoth_client + .get_work(identifier, Self::query_parameters()) + .await?; + if metadata_prefix == MetadataPrefix::MarcXml && !Self::is_marcxml_record_candidate(&work) { + return Err(ThothError::IncompleteMetadataRecord( + metadata_prefix.as_str().to_string(), + "Record cannot be disseminated as MARCXML".to_string(), + )); + } + Ok(work) + } + + pub(crate) async fn list_records( + &self, + metadata_prefix: MetadataPrefix, + set: Option, + offset: i64, + identifiers_only: bool, + ) -> ThothResult { + let set_record = self.find_set(set.as_deref()).await?; + let publishers = set_record + .as_ref() + .map(|set_record| vec![set_record.publisher_id]); + let cursor = offset; + + if metadata_prefix == MetadataPrefix::MarcXml { + let total = self + .thoth_client + .get_oai_book_count(publishers.clone()) + .await?; + let mut records = Vec::new(); + let mut raw_offset = offset; + + while raw_offset < total && records.len() < PAGE_LIMIT as usize { + let batch = self + .thoth_client + .get_oai_books( + publishers.clone(), + PAGE_LIMIT, + raw_offset, + Self::query_parameters(), + ) + .await?; + if batch.is_empty() { + break; + } + raw_offset += batch.len() as i64; + for work in batch { + if Self::is_marcxml_record_candidate(&work) { + records.push(work); + if records.len() == PAGE_LIMIT as usize { + break; + } + } + } + } + + let next_token = (raw_offset < total && !records.is_empty()).then(|| { + Self::encode_resumption_token(ResumptionToken { + offset: raw_offset, + metadata_prefix, + set, + identifiers_only, + }) + }); + + return Ok(RecordPage { + records, + cursor, + complete_list_size: total, + next_token, + }); + } + + let total = self + .thoth_client + .get_oai_work_count(publishers.clone()) + .await?; + let records = self + .thoth_client + .get_oai_works(publishers, PAGE_LIMIT, offset, Self::query_parameters()) + .await?; + let next_offset = offset + records.len() as i64; + let next_token = (next_offset < total && 
!records.is_empty()).then(|| { + Self::encode_resumption_token(ResumptionToken { + offset: next_offset, + metadata_prefix, + set, + identifiers_only, + }) + }); + + Ok(RecordPage { + records, + cursor, + complete_list_size: total, + next_token, + }) + } + + pub(crate) async fn get_marcxml_record(&self, work_id: Uuid) -> ThothResult { + let response = self + .export_client + .get(format!( + "{}/specifications/marc21xml::thoth/work/{}", + self.export_url.trim_end_matches('/'), + work_id + )) + .send() + .await + .map_err(|error| ThothError::RequestError(error.to_string()))?; + + let status = response.status(); + let body = response + .text() + .await + .map_err(|error| ThothError::RequestError(error.to_string()))?; + if !status.is_success() { + return Err(ThothError::RequestError(format!( + "Export {}: {}", + status.as_u16(), + body + ))); + } + + Self::extract_marc_record(&body) + } + + pub(crate) fn oai_identifier(work_id: Uuid) -> String { + format!("{RECORD_PREFIX}:{work_id}") + } + + pub(crate) fn parse_oai_identifier(identifier: &str) -> ThothResult { + identifier + .rsplit_once(':') + .map(|(_, value)| value) + .ok_or(ThothError::InvalidUuid) + .and_then(|value| Uuid::parse_str(value).map_err(|_| ThothError::InvalidUuid)) + } + + pub(crate) fn encode_resumption_token(token: ResumptionToken) -> String { + URL_SAFE_NO_PAD.encode(serde_json::to_vec(&token).expect("resumption token to serialize")) + } + + pub(crate) fn decode_resumption_token(value: &str) -> ThothResult { + let bytes = URL_SAFE_NO_PAD + .decode(value) + .map_err(|_| ThothError::RequestError("badResumptionToken".to_string()))?; + serde_json::from_slice(&bytes) + .map_err(|_| ThothError::RequestError("badResumptionToken".to_string())) + } + + pub(crate) fn timestamp_xml(timestamp: Timestamp) -> String { + timestamp.to_rfc3339().replace("+00:00", "Z") + } + + pub(crate) fn set_spec(publisher_name: &str) -> String { + publisher_name + .chars() + .filter(|ch| ch.is_alphanumeric() || ch.is_whitespace() 
|| *ch == '_') + .collect::() + .to_lowercase() + .split_whitespace() + .collect::>() + .join("-") + } + + pub(crate) fn is_marcxml_record_candidate(work: &Work) -> bool { + !work.contributions.is_empty() + && !work.languages.is_empty() + && work + .publications + .iter() + .any(|publication| publication.isbn.is_some()) + } + + pub(crate) fn query_parameters() -> QueryParameters { + QueryParameters::new() + .with_canonical_abstracts_only() + .with_canonical_title_only() + .with_issues() + .with_languages() + .with_publications() + .with_subjects() + .with_fundings() + .with_relations() + } + + async fn find_set(&self, set_spec: Option<&str>) -> ThothResult> { + let Some(set_spec) = set_spec else { + return Ok(None); + }; + + let sets = self.list_sets().await?; + sets.into_iter() + .find(|set_record| set_record.spec == set_spec) + .map(Some) + .ok_or(ThothError::EntityNotFound) + } + + fn to_set_record(publisher: Publisher) -> SetRecord { + let spec = Self::set_spec(&publisher.publisher_name); + SetRecord { + publisher_id: publisher.publisher_id, + spec, + name: publisher.publisher_name, + } + } + + fn extract_marc_record(body: &str) -> ThothResult { + let mut reader = Reader::from_str(body); + reader.config_mut().trim_text(false); + let mut writer = Writer::new(Vec::new()); + let mut capture_depth = 0usize; + let mut capturing = false; + + loop { + match reader.read_event() { + Ok(Event::Start(event)) => { + let is_record = event.local_name().as_ref() == b"record"; + if capturing { + capture_depth += 1; + writer + .write_event(Event::Start(event.to_owned())) + .map_err(|error| { + ThothError::InternalError(format!( + "Could not write MARCXML: {error}" + )) + })?; + } else if is_record { + capturing = true; + capture_depth = 1; + writer + .write_event(Event::Start(event.to_owned())) + .map_err(|error| { + ThothError::InternalError(format!( + "Could not write MARCXML: {error}" + )) + })?; + } + } + Ok(Event::Empty(event)) => { + let is_record = 
event.local_name().as_ref() == b"record"; + if capturing || is_record { + writer + .write_event(Event::Empty(event.to_owned())) + .map_err(|error| { + ThothError::InternalError(format!( + "Could not write MARCXML: {error}" + )) + })?; + if is_record && !capturing { + return String::from_utf8(writer.into_inner()).map_err(|_| { + ThothError::InternalError("Could not parse MARCXML".to_string()) + }); + } + } + } + Ok(Event::End(event)) => { + if capturing { + writer + .write_event(Event::End(event.to_owned())) + .map_err(|error| { + ThothError::InternalError(format!( + "Could not write MARCXML: {error}" + )) + })?; + capture_depth -= 1; + if capture_depth == 0 { + return String::from_utf8(writer.into_inner()).map_err(|_| { + ThothError::InternalError("Could not parse MARCXML".to_string()) + }); + } + } + } + Ok(Event::Text(event)) => { + if capturing { + writer + .write_event(Event::Text(event.to_owned())) + .map_err(|error| { + ThothError::InternalError(format!( + "Could not write MARCXML: {error}" + )) + })?; + } + } + Ok(Event::CData(event)) => { + if capturing { + writer + .write_event(Event::CData(event.to_owned())) + .map_err(|error| { + ThothError::InternalError(format!( + "Could not write MARCXML: {error}" + )) + })?; + } + } + Ok(Event::Comment(event)) => { + if capturing { + writer + .write_event(Event::Comment(event.to_owned())) + .map_err(|error| { + ThothError::InternalError(format!( + "Could not write MARCXML: {error}" + )) + })?; + } + } + Ok(Event::PI(event)) => { + if capturing { + writer + .write_event(Event::PI(event.to_owned())) + .map_err(|error| { + ThothError::InternalError(format!( + "Could not write MARCXML: {error}" + )) + })?; + } + } + Ok(Event::Decl(_)) | Ok(Event::DocType(_)) => {} + Ok(Event::Eof) => { + return Err(ThothError::InternalError( + "No marc:record element found".to_string(), + )); + } + Err(error) => { + return Err(ThothError::InternalError(format!( + "Could not parse MARCXML: {error}" + ))); + } + } + } + } +} + 
+#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn set_spec_normalizes_publisher_name() { + assert_eq!( + OaiService::set_spec("Punctum Books, Inc."), + "punctum-books-inc" + ); + assert_eq!( + OaiService::set_spec("Open Access_ Press"), + "open-access_-press" + ); + } + + #[test] + fn oai_identifier_round_trip() { + let work_id = Uuid::parse_str("5a08ff03-7d53-42a9-bfb5-7fc81c099c52").unwrap(); + let identifier = OaiService::oai_identifier(work_id); + + assert_eq!( + OaiService::parse_oai_identifier(&identifier).unwrap(), + work_id + ); + } + + #[test] + fn resumption_token_round_trip() { + let token = ResumptionToken { + offset: 150, + metadata_prefix: MetadataPrefix::MarcXml, + set: Some("open-book-publishers".to_string()), + identifiers_only: true, + }; + + let encoded = OaiService::encode_resumption_token(token.clone()); + + assert_eq!( + OaiService::decode_resumption_token(&encoded).unwrap(), + token + ); + } + + #[test] + fn extract_marc_record_returns_record_element() { + let xml = r#" + + + 00000nam a2200000 i 4500 + 123 + +"#; + let record = OaiService::extract_marc_record(xml).unwrap(); + + assert!(record.starts_with("00000nam a2200000 i 4500")); + assert!(record.contains("123")); + assert!(!record.contains(" Date: Mon, 13 Apr 2026 16:17:36 +0100 Subject: [PATCH 02/19] Harden OAI --- Cargo.lock | 1 + thoth-oai-server/Cargo.toml | 1 + thoth-oai-server/README.md | 18 +- thoth-oai-server/src/lib.rs | 1455 ++++++++++++++++++++++++++++-- thoth-oai-server/src/metadata.rs | 12 +- thoth-oai-server/src/service.rs | 250 +++-- 6 files changed, 1609 insertions(+), 128 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a3776c9ee..93dedf2eb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5122,6 +5122,7 @@ dependencies = [ "thoth-api", "thoth-client", "thoth-errors", + "url", "uuid", "xml-rs", ] diff --git a/thoth-oai-server/Cargo.toml b/thoth-oai-server/Cargo.toml index fb9082c30..9ebcaa6e5 100644 --- a/thoth-oai-server/Cargo.toml +++ 
b/thoth-oai-server/Cargo.toml @@ -22,5 +22,6 @@ quick-xml = "0.36" reqwest = { version = "0.12", features = ["json"] } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" +url = "2.5" uuid = { version = "1.16.0", features = ["serde"] } xml-rs = "0.8.25" diff --git a/thoth-oai-server/README.md b/thoth-oai-server/README.md index 0712e6c76..f8fd3ab50 100644 --- a/thoth-oai-server/README.md +++ b/thoth-oai-server/README.md @@ -1,3 +1,17 @@ -# Thoth OAI Server +
    + + +

    Thoth OAI-PMH

    + +

    + OAI-PMH web server for Thoth, a metadata management and dissemination system +

    + +

    + GitHub Workflow + Thoth Releases + Crate Info + License Info +

    +
    -OAI-PMH server for the Thoth workspace. diff --git a/thoth-oai-server/src/lib.rs b/thoth-oai-server/src/lib.rs index 34962088e..fbbf2d72e 100644 --- a/thoth-oai-server/src/lib.rs +++ b/thoth-oai-server/src/lib.rs @@ -5,11 +5,11 @@ use std::{collections::HashMap, io, time::Duration}; use actix_cors::Cors; use actix_web::{middleware::Logger, web, App, HttpRequest, HttpResponse, HttpServer}; -use chrono::Utc; +use chrono::{DateTime, NaiveDate, Utc}; use quick_xml::escape::escape; use service::{ - MetadataPrefix, OaiService, RecordPage, ResumptionToken, ADMIN_EMAIL, RECORD_PREFIX, - REPOSITORY_NAME, SAMPLE_ID, + DatestampGranularity, MetadataPrefix, OaiService, RecordPage, ResumptionToken, ADMIN_EMAIL, + RECORD_PREFIX, REPOSITORY_NAME, SAMPLE_ID, }; use thoth_errors::ThothError; use uuid::Uuid; @@ -41,6 +41,18 @@ impl From for HandlerError { } } +#[derive(Debug, Default, Clone)] +struct ParsedParams { + values: HashMap, + has_repeated: bool, +} + +#[derive(Debug, Clone)] +struct ParsedListRequest { + token: ResumptionToken, + resumed: bool, +} + async fn index() -> HttpResponse { HttpResponse::Found() .append_header(("Location", "/oai")) @@ -53,17 +65,49 @@ async fn stylesheet() -> HttpResponse { .body(XSL_STYLESHEET) } -async fn oai( +async fn oai_get(request: HttpRequest, state: web::Data) -> HttpResponse { + let params = parse_form_encoded(request.query_string()); + oai_with_params(request, params, state).await +} + +async fn oai_post( request: HttpRequest, - params: web::Query>, + body: web::Bytes, state: web::Data, ) -> HttpResponse { - let params = params.into_inner(); - match handle_oai_request(&request, ¶ms, &state.service).await { - Ok(body) => xml_response(success_document(&state.service, ¶ms, &body)), + let mut params = parse_form_encoded(request.query_string()); + match std::str::from_utf8(&body) { + Ok(body) => params.merge(parse_form_encoded(body)), + Err(_) => { + return xml_response(error_document( + &state.service, + ¶ms.values, + "badArgument", + 
"Invalid UTF-8 request body", + )) + } + } + oai_with_params(request, params, state).await +} + +async fn oai_with_params( + request: HttpRequest, + params: ParsedParams, + state: web::Data, +) -> HttpResponse { + if params.has_repeated { + return xml_response(error_document( + &state.service, + ¶ms.values, + "badArgument", + "The request includes repeated arguments", + )); + } + match handle_oai_request(&request, ¶ms.values, &state.service).await { + Ok(body) => xml_response(success_document(&state.service, ¶ms.values, &body)), Err(HandlerError::Protocol(error)) => xml_response(error_document( &state.service, - ¶ms, + ¶ms.values, error.code, &error.message, )), @@ -76,6 +120,31 @@ async fn oai( } } +impl ParsedParams { + fn merge(&mut self, other: ParsedParams) { + self.has_repeated = self.has_repeated || other.has_repeated; + for (key, value) in other.values { + if self.values.insert(key, value).is_some() { + self.has_repeated = true; + } + } + } +} + +fn parse_form_encoded(input: &str) -> ParsedParams { + let mut parsed = ParsedParams::default(); + for (key, value) in url::form_urlencoded::parse(input.as_bytes()) { + if parsed + .values + .insert(key.into_owned(), value.into_owned()) + .is_some() + { + parsed.has_repeated = true; + } + } + parsed +} + async fn not_found() -> HttpResponse { HttpResponse::NotFound() .content_type("text/html; charset=utf-8") @@ -117,16 +186,40 @@ async fn handle_oai_request( } "ListMetadataFormats" => { require_only(params, &["verb", "identifier"])?; + let mut prefixes = vec![ + MetadataPrefix::OaiDc, + MetadataPrefix::OaiOpenaire, + MetadataPrefix::MarcXml, + ]; if let Some(identifier) = params.get("identifier") { - let work_id = parse_identifier(identifier)?; - service + let work_id = parse_identifier_for_lookup(identifier)?; + let work = service .get_record(work_id, MetadataPrefix::OaiDc) .await .map_err(map_get_record_error(MetadataPrefix::OaiDc))?; + prefixes = vec![MetadataPrefix::OaiDc, MetadataPrefix::OaiOpenaire]; + if 
OaiService::is_marcxml_record_candidate(&work) { + prefixes.push(MetadataPrefix::MarcXml); + } + if prefixes.is_empty() { + return Err(ProtocolError { + code: "noMetadataFormats", + message: "No metadata formats are available for this identifier" + .to_string(), + } + .into()); + } } - Ok(render_list_metadata_formats()) + Ok(render_list_metadata_formats(&prefixes)) } "ListSets" => { + if params.contains_key("resumptionToken") { + return Err(ProtocolError { + code: "badResumptionToken", + message: "This repository does not support set resumption tokens".to_string(), + } + .into()); + } require_only(params, &["verb"])?; let sets = service.list_sets().await.map_err(HandlerError::Internal)?; Ok(render_list_sets(&sets)) @@ -139,7 +232,7 @@ async fn handle_oai_request( let metadata_prefix = params .get("metadataPrefix") .ok_or_else(|| bad_argument("Missing metadataPrefix parameter"))?; - let work_id = parse_identifier(identifier)?; + let work_id = parse_identifier_for_lookup(identifier)?; let metadata_prefix = parse_metadata_prefix(metadata_prefix)?; let work = service .get_record(work_id, metadata_prefix) @@ -149,9 +242,9 @@ async fn handle_oai_request( } "ListIdentifiers" => { validate_list_verb(params)?; - let token = parse_list_token(params, true)?; + let parsed = parse_list_token(params, true)?; let page = service - .list_records(token.metadata_prefix, token.set.clone(), token.offset, true) + .list_records(&parsed.token, parsed.resumed) .await .map_err(map_list_error)?; if page.records.is_empty() { @@ -161,20 +254,15 @@ async fn handle_oai_request( } "ListRecords" => { validate_list_verb(params)?; - let token = parse_list_token(params, false)?; + let parsed = parse_list_token(params, false)?; let page = service - .list_records( - token.metadata_prefix, - token.set.clone(), - token.offset, - false, - ) + .list_records(&parsed.token, parsed.resumed) .await .map_err(map_list_error)?; if page.records.is_empty() { return Err(HandlerError::Protocol(no_records_match())); } 
- Ok(render_list_records(service, &page, token.metadata_prefix).await?) + Ok(render_list_records(service, &page, parsed.token.metadata_prefix).await?) } other => Err(HandlerError::Protocol(bad_verb(&format!( "Unknown verb {other}" @@ -220,14 +308,9 @@ fn render_identify( ) } -fn render_list_metadata_formats() -> String { - let prefixes = [ - MetadataPrefix::OaiDc, - MetadataPrefix::OaiOpenaire, - MetadataPrefix::MarcXml, - ]; +fn render_list_metadata_formats(prefixes: &[MetadataPrefix]) -> String { let mut xml = String::from(""); - for prefix in prefixes { + for prefix in prefixes.iter().copied() { xml.push_str(""); push_text_element(&mut xml, "metadataPrefix", prefix.as_str()); push_text_element(&mut xml, "schema", prefix.schema()); @@ -266,7 +349,9 @@ fn render_list_identifiers(page: &RecordPage) -> String { for work in &page.records { xml.push_str(&render_header_xml(work)); } - if let Some(token) = &page.next_token { + if page.terminal_resumption_token { + xml.push_str(""); + } else if let Some(token) = &page.next_token { xml.push_str(&render_resumption_token( token, page.cursor, @@ -286,7 +371,9 @@ async fn render_list_records( for work in &page.records { xml.push_str(&render_record_xml(service, work, metadata_prefix).await?); } - if let Some(token) = &page.next_token { + if page.terminal_resumption_token { + xml.push_str(""); + } else if let Some(token) = &page.next_token { xml.push_str(&render_resumption_token( token, page.cursor, @@ -307,10 +394,20 @@ async fn render_record_xml( MetadataPrefix::OaiOpenaire => { metadata::map_oai_openaire(work).map_err(HandlerError::Internal)? 
} - MetadataPrefix::MarcXml => service - .get_marcxml_record(work.work_id) - .await - .map_err(map_get_record_error(metadata_prefix))?, + MetadataPrefix::MarcXml => { + service + .get_marcxml_record(work.work_id) + .await + .map_err(|_| { + HandlerError::Protocol(ProtocolError { + code: "cannotDisseminateFormat", + message: format!( + "Record cannot be disseminated as {}", + metadata_prefix.as_str() + ), + }) + })? + } }; Ok(format!( @@ -334,13 +431,21 @@ fn render_header_xml(work: &thoth_client::Work) -> String { ) } -fn render_resumption_token(token: &str, cursor: i64, complete_list_size: i64) -> String { - format!( - "{}", - cursor, - complete_list_size, - xml_escape(token) - ) +fn render_resumption_token(token: &str, cursor: i64, complete_list_size: Option) -> String { + if let Some(complete_list_size) = complete_list_size { + format!( + "{}", + cursor, + complete_list_size, + xml_escape(token) + ) + } else { + format!( + "{}", + cursor, + xml_escape(token) + ) + } } fn validate_list_verb(params: &HashMap) -> HandlerResult<()> { @@ -360,14 +465,14 @@ fn validate_list_verb(params: &HashMap) -> HandlerResult<()> { fn parse_list_token( params: &HashMap, identifiers_only: bool, -) -> HandlerResult { +) -> HandlerResult { if let Some(value) = params.get("resumptionToken") { if params.len() != 2 { return Err( bad_argument("resumptionToken cannot be combined with other arguments").into(), ); } - let token = OaiService::decode_resumption_token(value).map_err(|_| ProtocolError { + let mut token = OaiService::decode_resumption_token(value).map_err(|_| ProtocolError { code: "badResumptionToken", message: "Invalid resumptionToken".to_string(), })?; @@ -378,20 +483,160 @@ fn parse_list_token( } .into()); } - return Ok(token); + let (from, until, granularity) = parse_datestamp_filter( + token.from.as_deref(), + token.until.as_deref(), + token.granularity, + true, + )?; + token.from = from; + token.until = until; + token.granularity = granularity; + if 
token.scan_offset.is_none() { + token.scan_offset = Some(token.offset); + } + return Ok(ParsedListRequest { + token, + resumed: true, + }); } let metadata_prefix = params .get("metadataPrefix") .ok_or_else(|| bad_argument("Missing metadataPrefix parameter"))?; - Ok(ResumptionToken { - offset: 0, - metadata_prefix: parse_metadata_prefix(metadata_prefix)?, - set: params.get("set").cloned(), - identifiers_only, + let (from, until, granularity) = parse_datestamp_filter( + params.get("from").map(String::as_str), + params.get("until").map(String::as_str), + None, + false, + )?; + Ok(ParsedListRequest { + token: ResumptionToken { + offset: 0, + metadata_prefix: parse_metadata_prefix(metadata_prefix)?, + set: params.get("set").cloned(), + identifiers_only, + from, + until, + granularity, + scan_offset: Some(0), + }, + resumed: false, }) } +fn parse_datestamp_filter( + from: Option<&str>, + until: Option<&str>, + expected_granularity: Option, + is_resumption_token: bool, +) -> HandlerResult<(Option, Option, Option)> { + let parse_error = |message: &str| { + if is_resumption_token { + ProtocolError { + code: "badResumptionToken", + message: message.to_string(), + } + } else { + bad_argument(message) + } + }; + + let from_value = from + .map(|value| parse_datestamp_value(value, expected_granularity)) + .transpose() + .map_err(|message| parse_error(&message))?; + let until_value = until + .map(|value| parse_datestamp_value(value, expected_granularity)) + .transpose() + .map_err(|message| parse_error(&message))?; + + let mut granularity = expected_granularity; + if let Some((value_granularity, _)) = from_value { + granularity = Some(value_granularity); + } + if let Some((value_granularity, _)) = until_value { + if let Some(existing) = granularity { + if existing != value_granularity { + return Err( + parse_error("from and until must use the same datestamp granularity").into(), + ); + } + } else { + granularity = Some(value_granularity); + } + } + + let canonical_from = 
from_value.map(|(_, value)| value); + let canonical_until = until_value.map(|(_, value)| value); + + if let (Some(from_value), Some(until_value), Some(granularity)) = ( + canonical_from.as_deref(), + canonical_until.as_deref(), + granularity, + ) { + let ordered = match granularity { + DatestampGranularity::Day => from_value <= until_value, + DatestampGranularity::Second => { + let from = DateTime::parse_from_str(from_value, "%Y-%m-%dT%H:%M:%SZ") + .map_err(|_| parse_error("Invalid from datestamp"))?; + let until = DateTime::parse_from_str(until_value, "%Y-%m-%dT%H:%M:%SZ") + .map_err(|_| parse_error("Invalid until datestamp"))?; + from <= until + } + }; + if !ordered { + return Err(parse_error("from datestamp must be less than or equal to until").into()); + } + } + + Ok((canonical_from, canonical_until, granularity)) +} + +fn parse_datestamp_value( + value: &str, + expected_granularity: Option, +) -> Result<(DatestampGranularity, String), String> { + match expected_granularity { + Some(DatestampGranularity::Day) => NaiveDate::parse_from_str(value, "%Y-%m-%d") + .map(|date| { + ( + DatestampGranularity::Day, + date.format("%Y-%m-%d").to_string(), + ) + }) + .map_err(|_| "Invalid day datestamp".to_string()), + Some(DatestampGranularity::Second) => { + let datetime = DateTime::parse_from_str(value, "%Y-%m-%dT%H:%M:%SZ") + .map_err(|_| "Invalid second datestamp".to_string())?; + Ok(( + DatestampGranularity::Second, + datetime + .with_timezone(&Utc) + .format("%Y-%m-%dT%H:%M:%SZ") + .to_string(), + )) + } + None => { + if let Ok(date) = NaiveDate::parse_from_str(value, "%Y-%m-%d") { + return Ok(( + DatestampGranularity::Day, + date.format("%Y-%m-%d").to_string(), + )); + } + let datetime = DateTime::parse_from_str(value, "%Y-%m-%dT%H:%M:%SZ") + .map_err(|_| "Invalid datestamp".to_string())?; + Ok(( + DatestampGranularity::Second, + datetime + .with_timezone(&Utc) + .format("%Y-%m-%dT%H:%M:%SZ") + .to_string(), + )) + } + } +} + fn parse_metadata_prefix(value: &str) -> 
HandlerResult { MetadataPrefix::try_from(value).map_err(|_| { ProtocolError { @@ -402,8 +647,14 @@ fn parse_metadata_prefix(value: &str) -> HandlerResult { }) } -fn parse_identifier(value: &str) -> HandlerResult { - OaiService::parse_oai_identifier(value).map_err(|_| bad_argument("Invalid identifier").into()) +fn parse_identifier_for_lookup(value: &str) -> HandlerResult { + OaiService::parse_oai_identifier(value).map_err(|_| { + ProtocolError { + code: "idDoesNotExist", + message: "The requested identifier does not exist".to_string(), + } + .into() + }) } fn map_get_record_error( @@ -429,6 +680,12 @@ fn map_get_record_error( fn map_list_error(error: ThothError) -> HandlerError { match error { ThothError::EntityNotFound => no_records_match().into(), + ThothError::RequestError(message) if message.starts_with("badResumptionToken") => { + HandlerError::Protocol(ProtocolError { + code: "badResumptionToken", + message: "Invalid resumptionToken".to_string(), + }) + } other => HandlerError::Internal(other), } } @@ -468,7 +725,7 @@ fn success_document(service: &OaiService, params: &HashMap, body xml_declaration(), stylesheet_pi(), response_date(), - request_element(service, params), + request_element(service, params, true), body ) } @@ -479,27 +736,34 @@ fn error_document( code: &str, message: &str, ) -> String { + let include_attributes = !matches!(code, "badVerb" | "badArgument"); format!( "{}{}{}{}{}", xml_declaration(), stylesheet_pi(), response_date(), - request_element(service, params), + request_element(service, params, include_attributes), xml_escape(code), xml_escape(message) ) } -fn request_element(service: &OaiService, params: &HashMap) -> String { +fn request_element( + service: &OaiService, + params: &HashMap, + include_attributes: bool, +) -> String { let mut attrs = params.iter().collect::>(); attrs.sort_by(|(left, _), (right, _)| left.cmp(right)); let mut element = String::from("'); element.push_str(&xml_escape(&service.repository_url())); @@ -535,7 +799,7 
@@ fn push_text_element(xml: &mut String, name: &str, text: &str) { fn xml_response(body: String) -> HttpResponse { HttpResponse::Ok() - .content_type("application/xml; charset=utf-8") + .content_type("text/xml; charset=utf-8") .body(body) } @@ -557,10 +821,14 @@ pub async fn start_server( HttpServer::new(move || { App::new() .wrap(Logger::new(LOG_FORMAT)) - .wrap(Cors::default().allowed_methods(vec!["GET", "OPTIONS"])) + .wrap(Cors::default().allowed_methods(vec!["GET", "POST", "OPTIONS"])) .app_data(web::Data::new(state.clone())) .service(web::resource("/").route(web::get().to(index))) - .service(web::resource("/oai").route(web::get().to(oai))) + .service( + web::resource("/oai") + .route(web::get().to(oai_get)) + .route(web::post().to(oai_post)), + ) .service(web::resource("/oai2.xsl").route(web::get().to(stylesheet))) .default_service(web::route().to(not_found)) }) @@ -570,3 +838,1060 @@ pub async fn start_server( .run() .await } + +#[cfg(test)] +mod tests { + use super::*; + use crate::service::PAGE_LIMIT; + use actix_web::{dev::ServerHandle, http::header, test, App, HttpResponse, HttpServer}; + use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine as _}; + use chrono::{Duration, NaiveDate}; + use serde_json::{json, Value}; + use std::{collections::HashSet, net::TcpListener}; + + const PUBLISHER_ID: &str = "00000000-0000-0000-1111-000000000001"; + const PUBLISHER_NAME: &str = "Open Access Press"; + + #[derive(Clone)] + struct MockGraphqlState { + works: Vec, + publishers: Vec, + latest: String, + earliest: String, + } + + #[derive(Clone, Default)] + struct MockExportState { + failing_work_ids: HashSet, + malformed_work_ids: HashSet, + } + + struct RunningMockServer { + base_url: String, + handle: ServerHandle, + } + + impl RunningMockServer { + async fn stop(self) { + self.handle.stop(true).await; + } + } + + async fn spawn_graphql_server(state: MockGraphqlState) -> RunningMockServer { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind 
graphql mock server"); + let address = listener.local_addr().expect("graphql local address"); + let state = web::Data::new(state); + + let server = HttpServer::new(move || { + App::new() + .app_data(state.clone()) + .route("/graphql", web::post().to(graphql_mock_handler)) + }) + .listen(listener) + .expect("listen graphql mock server") + .run(); + let handle = server.handle(); + actix_web::rt::spawn(server); + + RunningMockServer { + base_url: format!("http://{address}"), + handle, + } + } + + async fn spawn_export_server(state: MockExportState) -> RunningMockServer { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind export mock server"); + let address = listener.local_addr().expect("export local address"); + let state = web::Data::new(state); + + let server = HttpServer::new(move || { + App::new().app_data(state.clone()).route( + "/specifications/marc21xml::thoth/work/{work_id}", + web::get().to(export_mock_handler), + ) + }) + .listen(listener) + .expect("listen export mock server") + .run(); + let handle = server.handle(); + actix_web::rt::spawn(server); + + RunningMockServer { + base_url: format!("http://{address}"), + handle, + } + } + + async fn graphql_mock_handler( + state: web::Data, + payload: web::Json, + ) -> HttpResponse { + let payload = payload.into_inner(); + let variables = payload + .get("variables") + .cloned() + .unwrap_or_else(|| json!({})); + let operation_name = request_operation_name(&payload); + + let response = match operation_name.as_deref() { + Some("OaiLatestWorksUpdatedQuery") => { + json!({ "data": { "works": [{ "updatedAtWithRelations": state.latest.clone() }] } }) + } + Some("OaiEarliestWorksUpdatedQuery") => { + json!({ "data": { "works": [{ "updatedAtWithRelations": state.earliest.clone() }] } }) + } + Some("PublishersQuery") => { + json!({ "data": { "publishers": state.publishers.clone() } }) + } + Some("WorkQuery") => { + let work_id = variables + .get("workId") + .and_then(Value::as_str) + .map(ToOwned::to_owned); 
+ match work_id.and_then(|work_id| find_work_by_id(&state, &work_id)) { + Some(work) => json!({ "data": { "work": work } }), + None => json!({ "errors": [{ "message": "work not found" }] }), + } + } + Some("OaiWorkCountQuery") => { + let works = filter_works_by_publishers(&state, &variables); + json!({ "data": { "workCount": works.len() as i64 } }) + } + Some("OaiBookCountQuery") => { + let works = filter_works_by_publishers(&state, &variables); + json!({ "data": { "bookCount": works.len() as i64 } }) + } + Some("OaiWorksQuery") => { + let works = + paginate_works(filter_works_by_publishers(&state, &variables), &variables); + json!({ "data": { "works": works } }) + } + Some("OaiBooksQuery") => { + let works = + paginate_works(filter_works_by_publishers(&state, &variables), &variables); + json!({ "data": { "books": works } }) + } + _ => json!({ "errors": [{ "message": "unsupported operation" }] }), + }; + + HttpResponse::Ok().json(response) + } + + async fn export_mock_handler( + state: web::Data, + work_id: web::Path, + ) -> HttpResponse { + let work_id = work_id.into_inner(); + if state.failing_work_ids.contains(&work_id) { + return HttpResponse::InternalServerError() + .content_type("text/plain; charset=utf-8") + .body("export failed"); + } + if state.malformed_work_ids.contains(&work_id) { + return HttpResponse::Ok() + .content_type("application/xml; charset=utf-8") + .body(""); + } + HttpResponse::Ok() + .content_type("application/xml; charset=utf-8") + .body(format!( + r#" + + + 00000nam a2200000 i 4500 + {work_id} + +"# + )) + } + + fn request_operation_name(payload: &Value) -> Option { + payload + .get("operationName") + .and_then(Value::as_str) + .map(ToOwned::to_owned) + .or_else(|| { + let query = payload.get("query").and_then(Value::as_str)?; + [ + "OaiLatestWorksUpdatedQuery", + "OaiEarliestWorksUpdatedQuery", + "PublishersQuery", + "WorkQuery", + "OaiWorkCountQuery", + "OaiBookCountQuery", + "OaiWorksQuery", + "OaiBooksQuery", + ] + .iter() + 
.find(|name| query.contains(**name)) + .map(|name| (*name).to_string()) + }) + } + + fn find_work_by_id(state: &MockGraphqlState, work_id: &str) -> Option { + state + .works + .iter() + .find(|work| work.get("workId").and_then(Value::as_str) == Some(work_id)) + .cloned() + } + + fn filter_works_by_publishers(state: &MockGraphqlState, variables: &Value) -> Vec { + let Some(publishers) = variables.get("publishers") else { + return state.works.clone(); + }; + if publishers.is_null() { + return state.works.clone(); + } + let Some(ids) = publishers.as_array() else { + return state.works.clone(); + }; + if ids.is_empty() { + return Vec::new(); + } + let allowed_names = ids + .iter() + .filter_map(Value::as_str) + .filter_map(|publisher_id| { + state + .publishers + .iter() + .find(|publisher| { + publisher.get("publisherId").and_then(Value::as_str) == Some(publisher_id) + }) + .and_then(|publisher| publisher.get("publisherName").and_then(Value::as_str)) + .map(ToOwned::to_owned) + }) + .collect::>(); + state + .works + .iter() + .filter(|work| { + work.get("imprint") + .and_then(|imprint| imprint.get("publisher")) + .and_then(|publisher| publisher.get("publisherName")) + .and_then(Value::as_str) + .is_some_and(|publisher_name| allowed_names.contains(publisher_name)) + }) + .cloned() + .collect() + } + + fn paginate_works(works: Vec, variables: &Value) -> Vec { + let offset = variables.get("offset").and_then(Value::as_i64).unwrap_or(0); + let limit = variables + .get("limit") + .and_then(Value::as_i64) + .unwrap_or(PAGE_LIMIT); + works + .into_iter() + .skip(offset.max(0) as usize) + .take(limit.max(0) as usize) + .collect() + } + + fn mock_graphql_state(mut works: Vec) -> MockGraphqlState { + works.sort_by(|left, right| { + let left = left + .get("updatedAtWithRelations") + .and_then(Value::as_str) + .unwrap_or_default(); + let right = right + .get("updatedAtWithRelations") + .and_then(Value::as_str) + .unwrap_or_default(); + right.cmp(left) + }); + let latest = works + 
.first() + .and_then(|work| work.get("updatedAtWithRelations")) + .and_then(Value::as_str) + .unwrap_or("2024-12-31T00:00:00Z") + .to_string(); + let earliest = works + .last() + .and_then(|work| work.get("updatedAtWithRelations")) + .and_then(Value::as_str) + .unwrap_or("2024-01-01T00:00:00Z") + .to_string(); + + MockGraphqlState { + works, + publishers: vec![json!({ + "publisherId": PUBLISHER_ID, + "publisherName": PUBLISHER_NAME, + })], + latest, + earliest, + } + } + + fn make_work( + work_id: Uuid, + updated_at_with_relations: &str, + publisher_name: &str, + marc_eligible: bool, + include_xml_publication: bool, + ) -> Value { + let contributions = if marc_eligible { + vec![json!({ + "contributionType": "AUTHOR", + "firstName": "Ada", + "lastName": "Lovelace", + "fullName": "Ada Lovelace", + "mainContribution": true, + "biographies": [], + "contributionOrdinal": 1, + "contributor": { + "orcid": "https://orcid.org/0000-0002-0000-0001", + "website": null + }, + "affiliations": [] + })] + } else { + Vec::new() + }; + let languages = if marc_eligible { + vec![json!({ + "languageCode": "ENG", + "languageRelation": "ORIGINAL" + })] + } else { + Vec::new() + }; + let mut publications = vec![json!({ + "publicationId": Uuid::from_u128(work_id.as_u128() + 10), + "publicationType": "PDF", + "isbn": if marc_eligible { Value::String("978-1-4028-9462-6".to_string()) } else { Value::Null }, + "weightG": null, + "weightOz": null, + "widthMm": null, + "widthCm": null, + "widthIn": null, + "heightMm": null, + "heightCm": null, + "heightIn": null, + "depthMm": null, + "depthCm": null, + "depthIn": null, + "accessibilityStandard": null, + "accessibilityAdditionalStandard": null, + "accessibilityException": null, + "accessibilityReportUrl": null, + "prices": [], + "locations": [ + { + "landingPage": "https://example.org/book", + "fullTextUrl": "https://example.org/book.pdf", + "locationPlatform": "OTHER", + "canonical": true + } + ] + })]; + if include_xml_publication { + 
publications.push(json!({ + "publicationId": Uuid::from_u128(work_id.as_u128() + 11), + "publicationType": "XML", + "isbn": "978-92-95055-02-5", + "weightG": null, + "weightOz": null, + "widthMm": null, + "widthCm": null, + "widthIn": null, + "heightMm": null, + "heightCm": null, + "heightIn": null, + "depthMm": null, + "depthCm": null, + "depthIn": null, + "accessibilityStandard": null, + "accessibilityAdditionalStandard": null, + "accessibilityException": null, + "accessibilityReportUrl": null, + "prices": [], + "locations": [] + })); + } + + let mut work = serde_json::Map::new(); + work.insert("workId".to_string(), json!(work_id)); + work.insert( + "updatedAtWithRelations".to_string(), + json!(updated_at_with_relations), + ); + work.insert("workStatus".to_string(), json!("ACTIVE")); + work.insert("workType".to_string(), json!("MONOGRAPH")); + work.insert("reference".to_string(), Value::Null); + work.insert("edition".to_string(), json!(1)); + work.insert( + "doi".to_string(), + json!(format!("https://doi.org/10.00001/{work_id}")), + ); + work.insert("publicationDate".to_string(), json!("2024-01-01")); + work.insert("withdrawnDate".to_string(), Value::Null); + work.insert( + "license".to_string(), + json!("http://creativecommons.org/licenses/by/4.0/"), + ); + work.insert("copyrightHolder".to_string(), json!("Author")); + work.insert("generalNote".to_string(), Value::Null); + work.insert("bibliographyNote".to_string(), Value::Null); + work.insert("place".to_string(), json!("London")); + work.insert("pageCount".to_string(), json!(100)); + work.insert("pageBreakdown".to_string(), Value::Null); + work.insert("firstPage".to_string(), Value::Null); + work.insert("lastPage".to_string(), Value::Null); + work.insert("pageInterval".to_string(), Value::Null); + work.insert("imageCount".to_string(), Value::Null); + work.insert("tableCount".to_string(), Value::Null); + work.insert("audioCount".to_string(), Value::Null); + work.insert("videoCount".to_string(), Value::Null); + 
work.insert("landingPage".to_string(), json!("https://example.org/book")); + work.insert("toc".to_string(), Value::Null); + work.insert("lccn".to_string(), Value::Null); + work.insert("oclc".to_string(), Value::Null); + work.insert("coverUrl".to_string(), Value::Null); + work.insert("coverCaption".to_string(), Value::Null); + work.insert( + "titles".to_string(), + json!([{ + "titleId": Uuid::from_u128(work_id.as_u128() + 1), + "localeCode": "EN", + "fullTitle": "Sample Title", + "title": "Sample Title", + "subtitle": null, + "canonical": true + }]), + ); + work.insert("abstracts".to_string(), json!([])); + work.insert( + "imprint".to_string(), + json!({ + "imprintName": "Imprint", + "imprintUrl": null, + "crossmarkDoi": null, + "defaultCurrency": "EUR", + "defaultPlace": "London", + "defaultLocale": "EN", + "publisher": { + "publisherName": publisher_name, + "publisherShortname": "OAP", + "publisherUrl": null, + "accessibilityStatement": null, + "contacts": [] + } + }), + ); + work.insert("issues".to_string(), json!([])); + work.insert("contributions".to_string(), json!(contributions)); + work.insert("languages".to_string(), json!(languages)); + work.insert("publications".to_string(), json!(publications)); + work.insert("subjects".to_string(), json!([])); + work.insert("fundings".to_string(), json!([])); + work.insert("relations".to_string(), json!([])); + work.insert("references".to_string(), json!([])); + Value::Object(work) + } + + fn make_descending_work_series(count: usize) -> Vec { + let base_date = NaiveDate::from_ymd_opt(2024, 12, 31).expect("valid base date"); + (0..count) + .map(|index| { + let updated_at = (base_date - Duration::days(index as i64)) + .format("%Y-%m-%dT12:00:00Z") + .to_string(); + let work_id = + Uuid::from_u128(0x1000_0000_0000_0000_0000_0000_0000_0000 + index as u128); + make_work(work_id, &updated_at, PUBLISHER_NAME, true, true) + }) + .collect() + } + + fn normalize_response_date(xml: &str) -> String { + let open = ""; + let close = 
""; + let Some(start) = xml.find(open) else { + return xml.to_string(); + }; + let value_start = start + open.len(); + let Some(value_end_rel) = xml[value_start..].find(close) else { + return xml.to_string(); + }; + let value_end = value_start + value_end_rel; + let mut normalized = String::new(); + normalized.push_str(&xml[..value_start]); + normalized.push_str("RESPONSE_DATE"); + normalized.push_str(&xml[value_end..]); + normalized + } + + fn request_opening_tag(xml: &str) -> String { + let start = xml.find("') + .map(|offset| start + offset) + .expect("request closing bracket"); + xml[start..=end].to_string() + } + + fn extract_resumption_token(xml: &str) -> Option { + let token_start = xml.find("')? + 1; + let content_end = content_start + xml[content_start..].find("")?; + let value = xml[content_start..content_end].trim(); + if value.is_empty() { + None + } else { + Some(value.to_string()) + } + } + + fn count_occurrences(haystack: &str, needle: &str) -> usize { + haystack.matches(needle).count() + } + + #[actix_web::test] + async fn get_and_post_are_equivalent_for_all_oai_verbs() { + let work_id = Uuid::from_u128(1); + let works = vec![make_work( + work_id, + "2024-12-30T12:00:00Z", + PUBLISHER_NAME, + true, + true, + )]; + + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + })) + .service( + web::resource("/oai") + .route(web::get().to(oai_get)) + .route(web::post().to(oai_post)), + ), + ) + .await; + + let identifier = OaiService::oai_identifier(work_id); + let cases = vec![ + "verb=Identify".to_string(), + "verb=ListMetadataFormats".to_string(), + "verb=ListSets".to_string(), + 
format!("verb=GetRecord&identifier={identifier}&metadataPrefix=oai_dc"), + "verb=ListIdentifiers&metadataPrefix=oai_dc".to_string(), + "verb=ListRecords&metadataPrefix=oai_dc".to_string(), + ]; + + for case in cases { + let get_req = test::TestRequest::get() + .uri(&format!("/oai?{case}")) + .to_request(); + let get_response = test::call_service(&app, get_req).await; + assert_eq!(get_response.status(), actix_web::http::StatusCode::OK); + assert_eq!( + get_response + .headers() + .get(header::CONTENT_TYPE) + .and_then(|value| value.to_str().ok()) + .expect("GET content type"), + "text/xml; charset=utf-8" + ); + let get_body = String::from_utf8(test::read_body(get_response).await.to_vec()) + .expect("GET body UTF-8"); + + let post_req = test::TestRequest::post() + .uri("/oai") + .insert_header((header::CONTENT_TYPE, "application/x-www-form-urlencoded")) + .set_payload(case.clone()) + .to_request(); + let post_response = test::call_service(&app, post_req).await; + assert_eq!(post_response.status(), actix_web::http::StatusCode::OK); + assert_eq!( + post_response + .headers() + .get(header::CONTENT_TYPE) + .and_then(|value| value.to_str().ok()) + .expect("POST content type"), + "text/xml; charset=utf-8" + ); + let post_body = String::from_utf8(test::read_body(post_response).await.to_vec()) + .expect("POST body UTF-8"); + + assert_eq!( + normalize_response_date(&get_body), + normalize_response_date(&post_body) + ); + } + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn repeated_arguments_return_bad_argument() { + let graphql_server = + spawn_graphql_server(mock_graphql_state(make_descending_work_series(1))).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + 
), + })) + .service( + web::resource("/oai") + .route(web::get().to(oai_get)) + .route(web::post().to(oai_post)), + ), + ) + .await; + + let get_req = test::TestRequest::get() + .uri("/oai?verb=Identify&verb=ListSets") + .to_request(); + let get_response = test::call_service(&app, get_req).await; + let get_body = String::from_utf8(test::read_body(get_response).await.to_vec()) + .expect("GET body UTF-8"); + assert!(get_body.contains("")); + + let post_req = test::TestRequest::post() + .uri("/oai?verb=Identify") + .insert_header((header::CONTENT_TYPE, "application/x-www-form-urlencoded")) + .set_payload("verb=ListSets") + .to_request(); + let post_response = test::call_service(&app, post_req).await; + let post_body = String::from_utf8(test::read_body(post_response).await.to_vec()) + .expect("POST body UTF-8"); + assert!(post_body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn request_attributes_are_omitted_for_bad_verb_and_bad_argument() { + let graphql_server = + spawn_graphql_server(mock_graphql_state(make_descending_work_series(1))).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let bad_verb_req = test::TestRequest::get() + .uri("/oai?verb=UnknownVerb") + .to_request(); + let bad_verb_response = test::call_service(&app, bad_verb_req).await; + let bad_verb_body = String::from_utf8(test::read_body(bad_verb_response).await.to_vec()) + .expect("badVerb body UTF-8"); + assert!(bad_verb_body.contains("")); + assert_eq!(request_opening_tag(&bad_verb_body), ""); + + let bad_argument_req = test::TestRequest::get() + 
.uri("/oai?verb=Identify&foo=bar") + .to_request(); + let bad_argument_response = test::call_service(&app, bad_argument_req).await; + let bad_argument_body = + String::from_utf8(test::read_body(bad_argument_response).await.to_vec()) + .expect("badArgument body UTF-8"); + assert!(bad_argument_body.contains("")); + assert_eq!(request_opening_tag(&bad_argument_body), ""); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn list_sets_rejects_resumption_tokens() { + let graphql_server = + spawn_graphql_server(mock_graphql_state(make_descending_work_series(1))).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri("/oai?verb=ListSets&resumptionToken=abc") + .to_request(); + let response = test::call_service(&app, req).await; + let body = String::from_utf8(test::read_body(response).await.to_vec()).expect("body UTF-8"); + + assert!(body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn list_metadata_formats_is_identifier_aware() { + let marc_eligible_id = Uuid::from_u128(10); + let marc_ineligible_id = Uuid::from_u128(11); + let works = vec![ + make_work( + marc_eligible_id, + "2024-12-31T12:00:00Z", + PUBLISHER_NAME, + true, + true, + ), + make_work( + marc_ineligible_id, + "2024-12-30T12:00:00Z", + PUBLISHER_NAME, + false, + true, + ), + ]; + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + 
.app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let eligible_req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=ListMetadataFormats&identifier={}", + OaiService::oai_identifier(marc_eligible_id) + )) + .to_request(); + let eligible_response = test::call_service(&app, eligible_req).await; + let eligible_body = String::from_utf8(test::read_body(eligible_response).await.to_vec()) + .expect("eligible body UTF-8"); + assert!(eligible_body.contains("oai_dc")); + assert!(eligible_body.contains("oai_openaire")); + assert!(eligible_body.contains("marcxml")); + + let ineligible_req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=ListMetadataFormats&identifier={}", + OaiService::oai_identifier(marc_ineligible_id) + )) + .to_request(); + let ineligible_response = test::call_service(&app, ineligible_req).await; + let ineligible_body = + String::from_utf8(test::read_body(ineligible_response).await.to_vec()) + .expect("ineligible body UTF-8"); + assert!(ineligible_body.contains("oai_dc")); + assert!(ineligible_body.contains("oai_openaire")); + assert!(!ineligible_body.contains("marcxml")); + + let invalid_identifier_req = test::TestRequest::get() + .uri( + "/oai?verb=ListMetadataFormats&identifier=oai:example.org:00000000-0000-0000-0000-000000000001", + ) + .to_request(); + let invalid_identifier_response = test::call_service(&app, invalid_identifier_req).await; + let invalid_identifier_body = + String::from_utf8(test::read_body(invalid_identifier_response).await.to_vec()) + .expect("invalid identifier body UTF-8"); + assert!(invalid_identifier_body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn marc_export_failures_are_mapped_to_cannot_disseminate_format() { + let 
work_id = Uuid::from_u128(20); + let works = vec![make_work( + work_id, + "2024-12-31T12:00:00Z", + PUBLISHER_NAME, + true, + true, + )]; + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + + let mut export_state = MockExportState::default(); + export_state.failing_work_ids.insert(work_id); + let export_server = spawn_export_server(export_state).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=GetRecord&identifier={}&metadataPrefix=marcxml", + OaiService::oai_identifier(work_id) + )) + .to_request(); + let response = test::call_service(&app, req).await; + let body = String::from_utf8(test::read_body(response).await.to_vec()).expect("body UTF-8"); + assert!(body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn list_identifiers_validates_datestamp_arguments() { + let graphql_server = + spawn_graphql_server(mock_graphql_state(make_descending_work_series(3))).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let mismatch_req = test::TestRequest::get() + .uri( + "/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=2024-01-01&until=2024-01-01T00:00:00Z", + ) + .to_request(); + let mismatch_response = test::call_service(&app, mismatch_req).await; + let mismatch_body = 
String::from_utf8(test::read_body(mismatch_response).await.to_vec()) + .expect("mismatch body UTF-8"); + assert!(mismatch_body.contains("")); + + let invalid_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=20240101") + .to_request(); + let invalid_response = test::call_service(&app, invalid_req).await; + let invalid_body = String::from_utf8(test::read_body(invalid_response).await.to_vec()) + .expect("invalid body UTF-8"); + assert!(invalid_body.contains("")); + + let reversed_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=2024-12-31&until=2024-01-01") + .to_request(); + let reversed_response = test::call_service(&app, reversed_req).await; + let reversed_body = String::from_utf8(test::read_body(reversed_response).await.to_vec()) + .expect("reversed body UTF-8"); + assert!(reversed_body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn list_identifiers_applies_date_filters_and_reports_no_records_match() { + let works = vec![ + make_work( + Uuid::from_u128(30), + "2024-03-01T12:00:00Z", + PUBLISHER_NAME, + true, + true, + ), + make_work( + Uuid::from_u128(31), + "2024-02-01T12:00:00Z", + PUBLISHER_NAME, + true, + true, + ), + make_work( + Uuid::from_u128(32), + "2023-12-31T12:00:00Z", + PUBLISHER_NAME, + true, + true, + ), + ]; + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let from_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=2024-01-01") + 
.to_request(); + let from_response = test::call_service(&app, from_req).await; + let from_body = String::from_utf8(test::read_body(from_response).await.to_vec()) + .expect("from body UTF-8"); + assert_eq!(count_occurrences(&from_body, "
    "), 2); + + let until_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&until=2024-01-31") + .to_request(); + let until_response = test::call_service(&app, until_req).await; + let until_body = String::from_utf8(test::read_body(until_response).await.to_vec()) + .expect("until body UTF-8"); + assert_eq!(count_occurrences(&until_body, "
    "), 1); + + let range_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=2024-01-01&until=2024-02-15") + .to_request(); + let range_response = test::call_service(&app, range_req).await; + let range_body = String::from_utf8(test::read_body(range_response).await.to_vec()) + .expect("range body UTF-8"); + assert_eq!(count_occurrences(&range_body, "
    "), 1); + + let no_match_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=2030-01-01") + .to_request(); + let no_match_response = test::call_service(&app, no_match_req).await; + let no_match_body = String::from_utf8(test::read_body(no_match_response).await.to_vec()) + .expect("no match body UTF-8"); + assert!(no_match_body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn resumption_tokens_support_filters_backward_compatibility_and_terminal_token() { + let works = make_descending_work_series(60); + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let filtered_first_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=2024-11-10&until=2024-12-31") + .to_request(); + let filtered_first_response = test::call_service(&app, filtered_first_req).await; + let filtered_first_body = + String::from_utf8(test::read_body(filtered_first_response).await.to_vec()) + .expect("first filtered body UTF-8"); + assert_eq!(count_occurrences(&filtered_first_body, "
    "), 50); + assert!(filtered_first_body.contains("")); + assert!(!filtered_first_body.contains("completeListSize=\"")); + + let filtered_token = + extract_resumption_token(&filtered_first_body).expect("filtered resumption token"); + let decoded_filtered = OaiService::decode_resumption_token(&filtered_token) + .expect("decode filtered resumption token"); + assert_eq!(decoded_filtered.from.as_deref(), Some("2024-11-10")); + assert_eq!(decoded_filtered.until.as_deref(), Some("2024-12-31")); + assert_eq!( + decoded_filtered.granularity, + Some(DatestampGranularity::Day) + ); + assert!(decoded_filtered.scan_offset.is_some()); + + let filtered_second_req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=ListIdentifiers&resumptionToken={filtered_token}" + )) + .to_request(); + let filtered_second_response = test::call_service(&app, filtered_second_req).await; + let filtered_second_body = + String::from_utf8(test::read_body(filtered_second_response).await.to_vec()) + .expect("second filtered body UTF-8"); + assert_eq!(count_occurrences(&filtered_second_body, "
    "), 2); + assert!(filtered_second_body.contains("")); + + let unfiltered_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc") + .to_request(); + let unfiltered_response = test::call_service(&app, unfiltered_req).await; + let unfiltered_body = + String::from_utf8(test::read_body(unfiltered_response).await.to_vec()) + .expect("unfiltered body UTF-8"); + assert!(unfiltered_body.contains("completeListSize=\"60\"")); + + let legacy_token = URL_SAFE_NO_PAD.encode( + serde_json::to_vec(&json!({ + "offset": 0, + "metadata_prefix": "OaiDc", + "set": null, + "identifiers_only": true + })) + .expect("legacy token serialize"), + ); + let legacy_req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=ListIdentifiers&resumptionToken={legacy_token}" + )) + .to_request(); + let legacy_response = test::call_service(&app, legacy_req).await; + let legacy_body = String::from_utf8(test::read_body(legacy_response).await.to_vec()) + .expect("legacy response body UTF-8"); + assert!(legacy_body.contains("")); + + let malformed_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&resumptionToken=not-a-token") + .to_request(); + let malformed_response = test::call_service(&app, malformed_req).await; + let malformed_body = String::from_utf8(test::read_body(malformed_response).await.to_vec()) + .expect("malformed response body UTF-8"); + assert!(malformed_body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } +} diff --git a/thoth-oai-server/src/metadata.rs b/thoth-oai-server/src/metadata.rs index b96ee42cc..435f4a659 100644 --- a/thoth-oai-server/src/metadata.rs +++ b/thoth-oai-server/src/metadata.rs @@ -161,7 +161,7 @@ fn publication_type_value(publication_type: &PublicationType) -> &'static str { PublicationType::PAPERBACK => "paperback", PublicationType::PDF => "application/pdf", PublicationType::EPUB => "application/epub+zip", - PublicationType::XML => "application/xml", + PublicationType::XML => 
"text/xml", PublicationType::HTML => "text/html", PublicationType::DOCX => { "application/vnd.openxmlformats-officedocument.wordprocessingml.document" @@ -729,3 +729,13 @@ pub(crate) fn map_oai_openaire(work: &Work) -> ThothResult { xml.push_str(""); Ok(xml) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn xml_publication_type_maps_to_text_xml() { + assert_eq!(publication_type_value(&PublicationType::XML), "text/xml"); + } +} diff --git a/thoth-oai-server/src/service.rs b/thoth-oai-server/src/service.rs index 6605c7046..f30af05f1 100644 --- a/thoth-oai-server/src/service.rs +++ b/thoth-oai-server/src/service.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine as _}; +use chrono::{DateTime, NaiveDate, Utc}; use quick_xml::{events::Event, Reader, Writer}; use reqwest::Client; use serde::{Deserialize, Serialize}; @@ -30,12 +31,22 @@ pub(crate) enum MetadataPrefix { MarcXml, } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub(crate) enum DatestampGranularity { + Day, + Second, +} + #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub(crate) struct ResumptionToken { pub offset: i64, pub metadata_prefix: MetadataPrefix, pub set: Option, pub identifiers_only: bool, + pub from: Option, + pub until: Option, + pub granularity: Option, + pub scan_offset: Option, } #[derive(Debug, Clone)] @@ -49,8 +60,15 @@ pub(crate) struct SetRecord { pub(crate) struct RecordPage { pub records: Vec, pub cursor: i64, - pub complete_list_size: i64, + pub complete_list_size: Option, pub next_token: Option, + pub terminal_resumption_token: bool, +} + +#[derive(Debug, Clone)] +struct DatestampBounds { + from: Option, + until: Option, } impl MetadataPrefix { @@ -143,89 +161,93 @@ impl OaiService { pub(crate) async fn list_records( &self, - metadata_prefix: MetadataPrefix, - set: Option, - offset: i64, - identifiers_only: bool, + token: &ResumptionToken, + resumed: bool, ) -> ThothResult { - let 
set_record = self.find_set(set.as_deref()).await?; + let set_record = self.find_set(token.set.as_deref()).await?; let publishers = set_record .as_ref() .map(|set_record| vec![set_record.publisher_id]); - let cursor = offset; + let bounds = Self::build_datestamp_bounds(token)?; + let date_filter_active = bounds.is_some(); + let cursor = token.scan_offset.unwrap_or(token.offset); - if metadata_prefix == MetadataPrefix::MarcXml { - let total = self - .thoth_client + let total = if token.metadata_prefix == MetadataPrefix::MarcXml { + self.thoth_client .get_oai_book_count(publishers.clone()) - .await?; - let mut records = Vec::new(); - let mut raw_offset = offset; + .await? + } else { + self.thoth_client + .get_oai_work_count(publishers.clone()) + .await? + }; - while raw_offset < total && records.len() < PAGE_LIMIT as usize { - let batch = self - .thoth_client + let mut records = Vec::new(); + let mut raw_offset = cursor; + while raw_offset < total && records.len() < PAGE_LIMIT as usize { + let batch = if token.metadata_prefix == MetadataPrefix::MarcXml { + self.thoth_client .get_oai_books( publishers.clone(), PAGE_LIMIT, raw_offset, Self::query_parameters(), ) - .await?; - if batch.is_empty() { - break; + .await? + } else { + self.thoth_client + .get_oai_works( + publishers.clone(), + PAGE_LIMIT, + raw_offset, + Self::query_parameters(), + ) + .await? + }; + + if batch.is_empty() { + break; + } + + raw_offset += batch.len() as i64; + + for work in batch { + if token.metadata_prefix == MetadataPrefix::MarcXml + && !Self::is_marcxml_record_candidate(&work) + { + continue; } - raw_offset += batch.len() as i64; - for work in batch { - if Self::is_marcxml_record_candidate(&work) { - records.push(work); - if records.len() == PAGE_LIMIT as usize { - break; - } - } + if !Self::matches_datestamp_filter(work.updated_at_with_relations, bounds.as_ref())? 
+ { + continue; + } + records.push(work); + if records.len() == PAGE_LIMIT as usize { + break; } } - - let next_token = (raw_offset < total && !records.is_empty()).then(|| { - Self::encode_resumption_token(ResumptionToken { - offset: raw_offset, - metadata_prefix, - set, - identifiers_only, - }) - }); - - return Ok(RecordPage { - records, - cursor, - complete_list_size: total, - next_token, - }); } - let total = self - .thoth_client - .get_oai_work_count(publishers.clone()) - .await?; - let records = self - .thoth_client - .get_oai_works(publishers, PAGE_LIMIT, offset, Self::query_parameters()) - .await?; - let next_offset = offset + records.len() as i64; - let next_token = (next_offset < total && !records.is_empty()).then(|| { + let next_token = (raw_offset < total && !records.is_empty()).then(|| { Self::encode_resumption_token(ResumptionToken { - offset: next_offset, - metadata_prefix, - set, - identifiers_only, + offset: raw_offset, + metadata_prefix: token.metadata_prefix, + set: token.set.clone(), + identifiers_only: token.identifiers_only, + from: token.from.clone(), + until: token.until.clone(), + granularity: token.granularity, + scan_offset: Some(raw_offset), }) }); + let terminal_resumption_token = resumed && next_token.is_none() && !records.is_empty(); Ok(RecordPage { records, cursor, - complete_list_size: total, + complete_list_size: (!date_filter_active).then_some(total), next_token, + terminal_resumption_token, }) } @@ -263,8 +285,7 @@ impl OaiService { pub(crate) fn parse_oai_identifier(identifier: &str) -> ThothResult { identifier - .rsplit_once(':') - .map(|(_, value)| value) + .strip_prefix(&format!("{RECORD_PREFIX}:")) .ok_or(ThothError::InvalidUuid) .and_then(|value| Uuid::parse_str(value).map_err(|_| ThothError::InvalidUuid)) } @@ -462,6 +483,84 @@ impl OaiService { } } } + + fn build_datestamp_bounds(token: &ResumptionToken) -> ThothResult> { + if token.from.is_none() && token.until.is_none() { + return Ok(None); + } + + let granularity = 
token.granularity.ok_or_else(|| { + ThothError::RequestError("badResumptionToken: missing date granularity".to_string()) + })?; + + let from = token + .from + .as_deref() + .map(|value| Self::parse_datestamp_boundary(value, granularity, true)) + .transpose()?; + let until = token + .until + .as_deref() + .map(|value| Self::parse_datestamp_boundary(value, granularity, false)) + .transpose()?; + + Ok(Some(DatestampBounds { from, until })) + } + + fn parse_datestamp_boundary( + value: &str, + granularity: DatestampGranularity, + is_from: bool, + ) -> ThothResult { + match granularity { + DatestampGranularity::Day => { + let date = NaiveDate::parse_from_str(value, "%Y-%m-%d").map_err(|_| { + ThothError::RequestError( + "badResumptionToken: invalid day datestamp".to_string(), + ) + })?; + let value = if is_from { + format!("{date}T00:00:00Z") + } else { + format!("{date}T23:59:59Z") + }; + Timestamp::parse_from_rfc3339(&value) + } + DatestampGranularity::Second => { + let datetime = + DateTime::parse_from_str(value, "%Y-%m-%dT%H:%M:%SZ").map_err(|_| { + ThothError::RequestError( + "badResumptionToken: invalid second datestamp".to_string(), + ) + })?; + let canonical = datetime + .with_timezone(&Utc) + .format("%Y-%m-%dT%H:%M:%SZ") + .to_string(); + Timestamp::parse_from_rfc3339(&canonical) + } + } + } + + fn matches_datestamp_filter( + datestamp: Timestamp, + bounds: Option<&DatestampBounds>, + ) -> ThothResult { + let Some(bounds) = bounds else { + return Ok(true); + }; + if let Some(from) = bounds.from { + if datestamp < from { + return Ok(false); + } + } + if let Some(until) = bounds.until { + if datestamp > until { + return Ok(false); + } + } + Ok(true) + } } #[cfg(test)] @@ -498,6 +597,10 @@ mod tests { metadata_prefix: MetadataPrefix::MarcXml, set: Some("open-book-publishers".to_string()), identifiers_only: true, + from: Some("2024-01-01".to_string()), + until: Some("2024-12-31".to_string()), + granularity: Some(DatestampGranularity::Day), + scan_offset: 
Some(200), }; let encoded = OaiService::encode_resumption_token(token.clone()); @@ -524,4 +627,31 @@ mod tests { assert!(record.contains("123")); assert!(!record.contains(" Date: Mon, 13 Apr 2026 17:25:17 +0100 Subject: [PATCH 03/19] Cursor harderning; compression support; retry-after --- Cargo.lock | 1 + src/bin/arguments/mod.rs | 11 + src/bin/commands/start.rs | 3 + thoth-oai-server/Cargo.toml | 1 + thoth-oai-server/src/lib.rs | 547 +++++++++++++++++++++++++++++--- thoth-oai-server/src/service.rs | 118 +++++-- 6 files changed, 606 insertions(+), 75 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 93dedf2eb..0604db07c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5114,6 +5114,7 @@ dependencies = [ "base64 0.22.1", "chrono", "env_logger", + "flate2", "log", "quick-xml", "reqwest", diff --git a/src/bin/arguments/mod.rs b/src/bin/arguments/mod.rs index 98d939b35..64df0eed2 100644 --- a/src/bin/arguments/mod.rs +++ b/src/bin/arguments/mod.rs @@ -96,6 +96,17 @@ pub fn oai_url() -> Arg { .num_args(1) } +pub fn oai_retry_after_seconds() -> Arg { + Arg::new("oai-retry-after-seconds") + .long("oai-retry-after-seconds") + .value_name("OAI_API_RETRY_AFTER_SECONDS") + .env("OAI_API_RETRY_AFTER_SECONDS") + .default_value("30") + .help("Retry-After value in seconds used for transient upstream OAI failures") + .num_args(1) + .value_parser(value_parser!(u64)) +} + pub fn zitadel_url() -> Arg { Arg::new("zitadel-url") .short('z') diff --git a/src/bin/commands/start.rs b/src/bin/commands/start.rs index 4061b42f6..73d1a9dc4 100644 --- a/src/bin/commands/start.rs +++ b/src/bin/commands/start.rs @@ -43,6 +43,7 @@ lazy_static! 
{ .arg(arguments::keep_alive("OAI_API_KEEP_ALIVE")) .arg(arguments::oai_url()) .arg(arguments::gql_endpoint()) + .arg(arguments::oai_retry_after_seconds()) .arg(arguments::export_url()), ); } @@ -120,6 +121,7 @@ pub fn oai_api(arguments: &ArgMatches) -> ThothResult<()> { .get_one::("gql-endpoint") .unwrap() .to_owned(); + let retry_after_seconds = *arguments.get_one::("oai-retry-after-seconds").unwrap(); let export_url = arguments .get_one::("export-url") .unwrap() @@ -133,6 +135,7 @@ pub fn oai_api(arguments: &ArgMatches) -> ThothResult<()> { public_url, gql_endpoint, export_url, + retry_after_seconds, ) .map_err(|e| e.into()) } diff --git a/thoth-oai-server/Cargo.toml b/thoth-oai-server/Cargo.toml index 9ebcaa6e5..dfdb7878c 100644 --- a/thoth-oai-server/Cargo.toml +++ b/thoth-oai-server/Cargo.toml @@ -17,6 +17,7 @@ actix-web = "4.10" base64 = "0.22.1" chrono = { version = "0.4.40", features = ["serde"] } env_logger = "0.11.7" +flate2 = "1.1.1" log = "0.4.26" quick-xml = "0.36" reqwest = { version = "0.12", features = ["json"] } diff --git a/thoth-oai-server/src/lib.rs b/thoth-oai-server/src/lib.rs index fbbf2d72e..241bf9db9 100644 --- a/thoth-oai-server/src/lib.rs +++ b/thoth-oai-server/src/lib.rs @@ -1,11 +1,18 @@ mod metadata; mod service; -use std::{collections::HashMap, io, time::Duration}; +use std::{ + collections::HashMap, + io::{self, Write}, + time::Duration, +}; use actix_cors::Cors; -use actix_web::{middleware::Logger, web, App, HttpRequest, HttpResponse, HttpServer}; +use actix_web::{ + http::header, middleware::Logger, web, App, HttpRequest, HttpResponse, HttpServer, +}; use chrono::{DateTime, NaiveDate, Utc}; +use flate2::{write::GzEncoder, Compression}; use quick_xml::escape::escape; use service::{ DatestampGranularity, MetadataPrefix, OaiService, RecordPage, ResumptionToken, ADMIN_EMAIL, @@ -16,10 +23,13 @@ use uuid::Uuid; const LOG_FORMAT: &str = r#"%{r}a %a "%r" %s %b "%{Referer}i" "%{User-Agent}i" %T"#; const XSL_STYLESHEET: &str = 
include_str!("../assets/oai2.xsl"); +#[cfg(test)] +const DEFAULT_RETRY_AFTER_SECONDS: u64 = 30; #[derive(Clone)] struct AppState { service: OaiService, + retry_after_seconds: u64, } #[derive(Debug)] @@ -79,12 +89,15 @@ async fn oai_post( match std::str::from_utf8(&body) { Ok(body) => params.merge(parse_form_encoded(body)), Err(_) => { - return xml_response(error_document( - &state.service, - ¶ms.values, - "badArgument", - "Invalid UTF-8 request body", - )) + return xml_response( + &request, + error_document( + &state.service, + ¶ms.values, + "badArgument", + "Invalid UTF-8 request body", + ), + ) } } oai_with_params(request, params, state).await @@ -96,30 +109,73 @@ async fn oai_with_params( state: web::Data, ) -> HttpResponse { if params.has_repeated { - return xml_response(error_document( - &state.service, - ¶ms.values, - "badArgument", - "The request includes repeated arguments", - )); + return xml_response( + &request, + error_document( + &state.service, + ¶ms.values, + "badArgument", + "The request includes repeated arguments", + ), + ); } match handle_oai_request(&request, ¶ms.values, &state.service).await { - Ok(body) => xml_response(success_document(&state.service, ¶ms.values, &body)), - Err(HandlerError::Protocol(error)) => xml_response(error_document( - &state.service, - ¶ms.values, - error.code, - &error.message, - )), + Ok(body) => xml_response( + &request, + success_document(&state.service, ¶ms.values, &body), + ), + Err(HandlerError::Protocol(error)) => xml_response( + &request, + error_document(&state.service, ¶ms.values, error.code, &error.message), + ), Err(HandlerError::Internal(error)) => { log::error!("OAI request failed: {error}"); - HttpResponse::InternalServerError() - .content_type("text/plain; charset=utf-8") - .body("Internal Server Error") + if is_transient_upstream_error(&error) { + transient_service_unavailable(state.retry_after_seconds) + } else { + HttpResponse::InternalServerError() + .content_type("text/plain; charset=utf-8") + 
.body("Internal Server Error") + } } } } +fn is_transient_upstream_error(error: &ThothError) -> bool { + let message = match error { + ThothError::RequestError(message) | ThothError::GraphqlError(message) => { + message.to_ascii_lowercase() + } + _ => return false, + }; + + let has_transient_status = [500, 502, 503, 504, 429].iter().any(|status| { + message.contains(&format!("graphql {status}")) + || message.contains(&format!("export {status}")) + }); + let has_network_failure = [ + "timed out", + "timeout", + "connection refused", + "connection reset", + "error sending request", + "temporary failure", + "dns error", + "failed to lookup address", + ] + .iter() + .any(|needle| message.contains(needle)); + + has_transient_status || has_network_failure +} + +fn transient_service_unavailable(retry_after_seconds: u64) -> HttpResponse { + HttpResponse::ServiceUnavailable() + .insert_header((header::RETRY_AFTER, retry_after_seconds.to_string())) + .content_type("text/plain; charset=utf-8") + .body("Service Unavailable") +} + impl ParsedParams { fn merge(&mut self, other: ParsedParams) { self.has_repeated = self.has_repeated || other.has_repeated; @@ -247,7 +303,7 @@ async fn handle_oai_request( .list_records(&parsed.token, parsed.resumed) .await .map_err(map_list_error)?; - if page.records.is_empty() { + if page.records.is_empty() && !parsed.resumed { return Err(HandlerError::Protocol(no_records_match())); } Ok(render_list_identifiers(&page)) @@ -259,7 +315,7 @@ async fn handle_oai_request( .list_records(&parsed.token, parsed.resumed) .await .map_err(map_list_error)?; - if page.records.is_empty() { + if page.records.is_empty() && !parsed.resumed { return Err(HandlerError::Protocol(no_records_match())); } Ok(render_list_records(service, &page, parsed.token.metadata_prefix).await?) 
@@ -284,6 +340,7 @@ fn render_identify( {}\ no\ YYYY-MM-DDThh:mm:ssZ\ +gzip\ \ \ oai\ @@ -394,20 +451,10 @@ async fn render_record_xml( MetadataPrefix::OaiOpenaire => { metadata::map_oai_openaire(work).map_err(HandlerError::Internal)? } - MetadataPrefix::MarcXml => { - service - .get_marcxml_record(work.work_id) - .await - .map_err(|_| { - HandlerError::Protocol(ProtocolError { - code: "cannotDisseminateFormat", - message: format!( - "Record cannot be disseminated as {}", - metadata_prefix.as_str() - ), - }) - })? - } + MetadataPrefix::MarcXml => service + .get_marcxml_record(work.work_id) + .await + .map_err(map_marcxml_error(metadata_prefix))?, }; Ok(format!( @@ -495,6 +542,9 @@ fn parse_list_token( if token.scan_offset.is_none() { token.scan_offset = Some(token.offset); } + if token.returned_count.is_none() { + token.returned_count = Some(token.offset); + } return Ok(ParsedListRequest { token, resumed: true, @@ -520,6 +570,7 @@ fn parse_list_token( until, granularity, scan_offset: Some(0), + returned_count: Some(0), }, resumed: false, }) @@ -677,6 +728,23 @@ fn map_get_record_error( } } +fn map_marcxml_error( + metadata_prefix: MetadataPrefix, +) -> impl Fn(ThothError) -> HandlerError + Copy { + move |error| match error { + ThothError::RequestError(_) if is_transient_upstream_error(&error) => { + HandlerError::Internal(error) + } + _ => HandlerError::Protocol(ProtocolError { + code: "cannotDisseminateFormat", + message: format!( + "Record cannot be disseminated as {}", + metadata_prefix.as_str() + ), + }), + } +} + fn map_list_error(error: ThothError) -> HandlerError { match error { ThothError::EntityNotFound => no_records_match().into(), @@ -797,13 +865,63 @@ fn push_text_element(xml: &mut String, name: &str, text: &str) { xml.push('>'); } -fn xml_response(body: String) -> HttpResponse { +fn xml_response(request: &HttpRequest, body: String) -> HttpResponse { + if request_accepts_gzip(request) { + let mut encoder = GzEncoder::new(Vec::new(), 
Compression::default()); + if encoder.write_all(body.as_bytes()).is_err() { + return HttpResponse::InternalServerError() + .content_type("text/plain; charset=utf-8") + .body("Internal Server Error"); + } + match encoder.finish() { + Ok(compressed_body) => { + return HttpResponse::Ok() + .insert_header((header::CONTENT_ENCODING, "gzip")) + .content_type("text/xml; charset=utf-8") + .body(compressed_body); + } + Err(_) => { + return HttpResponse::InternalServerError() + .content_type("text/plain; charset=utf-8") + .body("Internal Server Error"); + } + } + } + HttpResponse::Ok() .content_type("text/xml; charset=utf-8") .body(body) } +fn request_accepts_gzip(request: &HttpRequest) -> bool { + let Some(value) = request.headers().get(header::ACCEPT_ENCODING) else { + return false; + }; + let Ok(value) = value.to_str() else { + return false; + }; + + value.split(',').any(|item| { + let mut parts = item.trim().split(';'); + let coding = parts + .next() + .map(str::trim) + .unwrap_or_default() + .to_ascii_lowercase(); + if coding != "gzip" && coding != "*" { + return false; + } + let quality = parts + .map(str::trim) + .find_map(|part| part.strip_prefix("q=")) + .and_then(|value| value.parse::().ok()) + .unwrap_or(1.0); + quality > 0.0 + }) +} + #[actix_web::main] +#[allow(clippy::too_many_arguments)] pub async fn start_server( host: String, port: String, @@ -812,10 +930,12 @@ pub async fn start_server( public_url: String, gql_endpoint: String, export_url: String, + retry_after_seconds: u64, ) -> io::Result<()> { env_logger::init_from_env(env_logger::Env::new().default_filter_or("info")); let state = AppState { service: OaiService::new(public_url, gql_endpoint, export_url), + retry_after_seconds, }; HttpServer::new(move || { @@ -846,8 +966,9 @@ mod tests { use actix_web::{dev::ServerHandle, http::header, test, App, HttpResponse, HttpServer}; use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine as _}; use chrono::{Duration, NaiveDate}; + use flate2::read::GzDecoder; 
use serde_json::{json, Value}; - use std::{collections::HashSet, net::TcpListener}; + use std::{collections::HashSet, io::Read, net::TcpListener}; const PUBLISHER_ID: &str = "00000000-0000-0000-1111-000000000001"; const PUBLISHER_NAME: &str = "Open Access Press"; @@ -899,6 +1020,32 @@ mod tests { } } + async fn spawn_graphql_error_server(status: actix_web::http::StatusCode) -> RunningMockServer { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind graphql error server"); + let address = listener.local_addr().expect("graphql error local address"); + + let server = HttpServer::new(move || { + App::new().route( + "/graphql", + web::post().to(move || async move { + HttpResponse::build(status) + .content_type("application/json; charset=utf-8") + .body(r#"{"errors":[{"message":"upstream failure"}]}"#) + }), + ) + }) + .listen(listener) + .expect("listen graphql error server") + .run(); + let handle = server.handle(); + actix_web::rt::spawn(server); + + RunningMockServer { + base_url: format!("http://{address}"), + handle, + } + } + async fn spawn_export_server(state: MockExportState) -> RunningMockServer { let listener = TcpListener::bind("127.0.0.1:0").expect("bind export mock server"); let address = listener.local_addr().expect("export local address"); @@ -1372,6 +1519,7 @@ mod tests { format!("{}/graphql", graphql_server.base_url), export_server.base_url.clone(), ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, })) .service( web::resource("/oai") @@ -1405,6 +1553,10 @@ mod tests { .expect("GET content type"), "text/xml; charset=utf-8" ); + assert!(get_response + .headers() + .get(header::CONTENT_ENCODING) + .is_none()); let get_body = String::from_utf8(test::read_body(get_response).await.to_vec()) .expect("GET body UTF-8"); @@ -1423,9 +1575,18 @@ mod tests { .expect("POST content type"), "text/xml; charset=utf-8" ); + assert!(post_response + .headers() + .get(header::CONTENT_ENCODING) + .is_none()); let post_body = 
String::from_utf8(test::read_body(post_response).await.to_vec()) .expect("POST body UTF-8"); + if case == "verb=Identify" { + assert!(get_body.contains("gzip")); + assert!(post_body.contains("gzip")); + } + assert_eq!( normalize_response_date(&get_body), normalize_response_date(&post_body) @@ -1450,6 +1611,7 @@ mod tests { format!("{}/graphql", graphql_server.base_url), export_server.base_url.clone(), ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, })) .service( web::resource("/oai") @@ -1495,6 +1657,7 @@ mod tests { format!("{}/graphql", graphql_server.base_url), export_server.base_url.clone(), ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, })) .service(web::resource("/oai").route(web::get().to(oai_get))), ) @@ -1537,6 +1700,7 @@ mod tests { format!("{}/graphql", graphql_server.base_url), export_server.base_url.clone(), ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, })) .service(web::resource("/oai").route(web::get().to(oai_get))), ) @@ -1585,6 +1749,7 @@ mod tests { format!("{}/graphql", graphql_server.base_url), export_server.base_url.clone(), ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, })) .service(web::resource("/oai").route(web::get().to(oai_get))), ) @@ -1633,7 +1798,7 @@ mod tests { } #[actix_web::test] - async fn marc_export_failures_are_mapped_to_cannot_disseminate_format() { + async fn marc_export_parse_failures_are_mapped_to_cannot_disseminate_format() { let work_id = Uuid::from_u128(20); let works = vec![make_work( work_id, @@ -1645,7 +1810,7 @@ mod tests { let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; let mut export_state = MockExportState::default(); - export_state.failing_work_ids.insert(work_id); + export_state.malformed_work_ids.insert(work_id); let export_server = spawn_export_server(export_state).await; let app = test::init_service( @@ -1656,6 +1821,7 @@ mod tests { format!("{}/graphql", graphql_server.base_url), export_server.base_url.clone(), ), + retry_after_seconds: 
DEFAULT_RETRY_AFTER_SECONDS, })) .service(web::resource("/oai").route(web::get().to(oai_get))), ) @@ -1689,6 +1855,7 @@ mod tests { format!("{}/graphql", graphql_server.base_url), export_server.base_url.clone(), ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, })) .service(web::resource("/oai").route(web::get().to(oai_get))), ) @@ -1760,6 +1927,7 @@ mod tests { format!("{}/graphql", graphql_server.base_url), export_server.base_url.clone(), ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, })) .service(web::resource("/oai").route(web::get().to(oai_get))), ) @@ -1815,6 +1983,7 @@ mod tests { format!("{}/graphql", graphql_server.base_url), export_server.base_url.clone(), ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, })) .service(web::resource("/oai").route(web::get().to(oai_get))), ) @@ -1842,6 +2011,7 @@ mod tests { Some(DatestampGranularity::Day) ); assert!(decoded_filtered.scan_offset.is_some()); + assert_eq!(decoded_filtered.returned_count, Some(50)); let filtered_second_req = test::TestRequest::get() .uri(&format!( @@ -1894,4 +2064,293 @@ mod tests { export_server.stop().await; graphql_server.stop().await; } + + #[actix_web::test] + async fn filtered_resumption_cursor_tracks_returned_records() { + let works = make_descending_work_series(120); + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let first_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=2024-09-13&until=2024-12-31") + .to_request(); + let first_response = 
test::call_service(&app, first_req).await; + let first_body = String::from_utf8(test::read_body(first_response).await.to_vec()) + .expect("first page UTF-8"); + assert!(first_body.contains("")); + let first_token = extract_resumption_token(&first_body).expect("first token"); + + let second_req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=ListIdentifiers&resumptionToken={first_token}" + )) + .to_request(); + let second_response = test::call_service(&app, second_req).await; + let second_body = String::from_utf8(test::read_body(second_response).await.to_vec()) + .expect("second page UTF-8"); + assert_eq!(count_occurrences(&second_body, "
    "), 50); + assert!(second_body.contains("")); + let second_token = extract_resumption_token(&second_body).expect("second token"); + let decoded_second = OaiService::decode_resumption_token(&second_token).unwrap(); + assert_eq!(decoded_second.returned_count, Some(100)); + + let third_req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=ListIdentifiers&resumptionToken={second_token}" + )) + .to_request(); + let third_response = test::call_service(&app, third_req).await; + let third_body = String::from_utf8(test::read_body(third_response).await.to_vec()) + .expect("third page UTF-8"); + assert_eq!(count_occurrences(&third_body, "
    "), 10); + assert!(third_body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn continuation_end_returns_terminal_token_without_no_records_match() { + let works = make_descending_work_series(120); + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let first_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=2024-11-12&until=2024-12-31") + .to_request(); + let first_response = test::call_service(&app, first_req).await; + let first_body = String::from_utf8(test::read_body(first_response).await.to_vec()) + .expect("first page UTF-8"); + assert_eq!(count_occurrences(&first_body, "
    "), 50); + assert!(!first_body.contains("")); + assert!(continuation_body.contains("")); + assert!(!continuation_body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn gzip_accept_encoding_returns_compressed_oai_xml() { + let works = make_descending_work_series(1); + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri("/oai?verb=Identify") + .insert_header((header::ACCEPT_ENCODING, "gzip")) + .to_request(); + let response = test::call_service(&app, req).await; + assert_eq!(response.status(), actix_web::http::StatusCode::OK); + assert_eq!( + response + .headers() + .get(header::CONTENT_ENCODING) + .and_then(|value| value.to_str().ok()), + Some("gzip") + ); + + let compressed = test::read_body(response).await; + let mut decoder = GzDecoder::new(compressed.as_ref()); + let mut xml = String::new(); + decoder + .read_to_string(&mut xml) + .expect("gzip decode response"); + assert!(xml.contains("")); + assert!(xml.contains("gzip")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn transient_graphql_failures_return_503_with_retry_after() { + let graphql_server = + spawn_graphql_error_server(actix_web::http::StatusCode::SERVICE_UNAVAILABLE).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + 
"https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: 45, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri("/oai?verb=Identify") + .to_request(); + let response = test::call_service(&app, req).await; + assert_eq!( + response.status(), + actix_web::http::StatusCode::SERVICE_UNAVAILABLE + ); + assert_eq!( + response + .headers() + .get(header::RETRY_AFTER) + .and_then(|value| value.to_str().ok()), + Some("45") + ); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn non_transient_graphql_failures_remain_http_500() { + let graphql_server = + spawn_graphql_error_server(actix_web::http::StatusCode::BAD_REQUEST).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri("/oai?verb=Identify") + .to_request(); + let response = test::call_service(&app, req).await; + assert_eq!( + response.status(), + actix_web::http::StatusCode::INTERNAL_SERVER_ERROR + ); + assert!(response.headers().get(header::RETRY_AFTER).is_none()); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn transient_export_failures_return_503_with_retry_after() { + let work_id = Uuid::from_u128(21); + let works = vec![make_work( + work_id, + "2024-12-31T12:00:00Z", + PUBLISHER_NAME, + true, + true, + )]; + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + + let mut 
export_state = MockExportState::default(); + export_state.failing_work_ids.insert(work_id); + let export_server = spawn_export_server(export_state).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=GetRecord&identifier={}&metadataPrefix=marcxml", + OaiService::oai_identifier(work_id) + )) + .to_request(); + let response = test::call_service(&app, req).await; + assert_eq!( + response.status(), + actix_web::http::StatusCode::SERVICE_UNAVAILABLE + ); + assert_eq!( + response + .headers() + .get(header::RETRY_AFTER) + .and_then(|value| value.to_str().ok()), + Some("30") + ); + + export_server.stop().await; + graphql_server.stop().await; + } } diff --git a/thoth-oai-server/src/service.rs b/thoth-oai-server/src/service.rs index f30af05f1..6e65884e8 100644 --- a/thoth-oai-server/src/service.rs +++ b/thoth-oai-server/src/service.rs @@ -47,6 +47,7 @@ pub(crate) struct ResumptionToken { pub until: Option, pub granularity: Option, pub scan_offset: Option, + pub returned_count: Option, } #[derive(Debug, Clone)] @@ -170,7 +171,9 @@ impl OaiService { .map(|set_record| vec![set_record.publisher_id]); let bounds = Self::build_datestamp_bounds(token)?; let date_filter_active = bounds.is_some(); - let cursor = token.scan_offset.unwrap_or(token.offset); + let filtering_active = bounds.is_some() || token.metadata_prefix == MetadataPrefix::MarcXml; + let scan_offset = token.scan_offset.unwrap_or(token.offset); + let returned_count = token.returned_count.unwrap_or(token.offset); let total = if token.metadata_prefix == MetadataPrefix::MarcXml { self.thoth_client @@ -183,27 +186,11 @@ impl 
OaiService { }; let mut records = Vec::new(); - let mut raw_offset = cursor; + let mut raw_offset = scan_offset; while raw_offset < total && records.len() < PAGE_LIMIT as usize { - let batch = if token.metadata_prefix == MetadataPrefix::MarcXml { - self.thoth_client - .get_oai_books( - publishers.clone(), - PAGE_LIMIT, - raw_offset, - Self::query_parameters(), - ) - .await? - } else { - self.thoth_client - .get_oai_works( - publishers.clone(), - PAGE_LIMIT, - raw_offset, - Self::query_parameters(), - ) - .await? - }; + let batch = self + .fetch_record_batch(token.metadata_prefix, publishers.clone(), raw_offset) + .await?; if batch.is_empty() { break; @@ -212,13 +199,7 @@ impl OaiService { raw_offset += batch.len() as i64; for work in batch { - if token.metadata_prefix == MetadataPrefix::MarcXml - && !Self::is_marcxml_record_candidate(&work) - { - continue; - } - if !Self::matches_datestamp_filter(work.updated_at_with_relations, bounds.as_ref())? - { + if !Self::matches_record(&work, token.metadata_prefix, bounds.as_ref())? { continue; } records.push(work); @@ -228,7 +209,24 @@ impl OaiService { } } - let next_token = (raw_offset < total && !records.is_empty()).then(|| { + let has_next_page = if raw_offset < total && !records.is_empty() { + if filtering_active { + self.has_more_matching_records( + token.metadata_prefix, + publishers.clone(), + raw_offset, + total, + bounds.as_ref(), + ) + .await? 
+ } else { + true + } + } else { + false + }; + + let next_token = has_next_page.then(|| { Self::encode_resumption_token(ResumptionToken { offset: raw_offset, metadata_prefix: token.metadata_prefix, @@ -238,13 +236,14 @@ impl OaiService { until: token.until.clone(), granularity: token.granularity, scan_offset: Some(raw_offset), + returned_count: Some(returned_count + records.len() as i64), }) }); - let terminal_resumption_token = resumed && next_token.is_none() && !records.is_empty(); + let terminal_resumption_token = resumed && next_token.is_none(); Ok(RecordPage { records, - cursor, + cursor: returned_count, complete_list_size: (!date_filter_active).then_some(total), next_token, terminal_resumption_token, @@ -484,6 +483,61 @@ impl OaiService { } } + async fn fetch_record_batch( + &self, + metadata_prefix: MetadataPrefix, + publishers: Option>, + raw_offset: i64, + ) -> ThothResult> { + if metadata_prefix == MetadataPrefix::MarcXml { + self.thoth_client + .get_oai_books(publishers, PAGE_LIMIT, raw_offset, Self::query_parameters()) + .await + } else { + self.thoth_client + .get_oai_works(publishers, PAGE_LIMIT, raw_offset, Self::query_parameters()) + .await + } + } + + fn matches_record( + work: &Work, + metadata_prefix: MetadataPrefix, + bounds: Option<&DatestampBounds>, + ) -> ThothResult { + if metadata_prefix == MetadataPrefix::MarcXml && !Self::is_marcxml_record_candidate(work) { + return Ok(false); + } + Self::matches_datestamp_filter(work.updated_at_with_relations, bounds) + } + + async fn has_more_matching_records( + &self, + metadata_prefix: MetadataPrefix, + publishers: Option>, + mut raw_offset: i64, + total: i64, + bounds: Option<&DatestampBounds>, + ) -> ThothResult { + while raw_offset < total { + let batch = self + .fetch_record_batch(metadata_prefix, publishers.clone(), raw_offset) + .await?; + + if batch.is_empty() { + break; + } + + raw_offset += batch.len() as i64; + for work in batch { + if Self::matches_record(&work, metadata_prefix, bounds)? 
{ + return Ok(true); + } + } + } + Ok(false) + } + fn build_datestamp_bounds(token: &ResumptionToken) -> ThothResult> { if token.from.is_none() && token.until.is_none() { return Ok(None); @@ -601,6 +655,7 @@ mod tests { until: Some("2024-12-31".to_string()), granularity: Some(DatestampGranularity::Day), scan_offset: Some(200), + returned_count: Some(75), }; let encoded = OaiService::encode_resumption_token(token.clone()); @@ -650,6 +705,7 @@ mod tests { let token = OaiService::decode_resumption_token(&legacy).unwrap(); assert_eq!(token.offset, 100); assert_eq!(token.scan_offset, None); + assert_eq!(token.returned_count, None); assert_eq!(token.from, None); assert_eq!(token.until, None); assert_eq!(token.granularity, None); From e9cce33df5c485a21df6f0ae28d886c5881ee6bb Mon Sep 17 00:00:00 2001 From: Javier Arias Date: Mon, 13 Apr 2026 17:45:50 +0100 Subject: [PATCH 04/19] Replace xls --- thoth-oai-server/assets/oai2.xsl | 1152 ++++++++++++++---------------- thoth-oai-server/src/lib.rs | 39 + 2 files changed, 584 insertions(+), 607 deletions(-) diff --git a/thoth-oai-server/assets/oai2.xsl b/thoth-oai-server/assets/oai2.xsl index 5f29642d8..f9e301b2d 100644 --- a/thoth-oai-server/assets/oai2.xsl +++ b/thoth-oai-server/assets/oai2.xsl @@ -1,707 +1,645 @@ - - - - - - - - - - - - - -td.value { - vertical-align: top; - padding-left: 1em; - padding: 3px; -} -td.key { - background-color: #e0e0ff; - padding: 3px; - text-align: right; - border: 1px solid #c0c0c0; - white-space: nowrap; - font-weight: bold; - vertical-align: top; -} -.dcdata td.key { - background-color: #ffffe0; -} -body { - margin: 1em 2em 1em 2em; -} -h1, h2, h3 { - font-family: sans-serif; - clear: left; -} -h1 { - padding-bottom: 4px; - margin-bottom: 0px; -} -h2 { - margin-bottom: 0.5em; -} -h3 { - margin-bottom: 0.3em; - font-size: medium; -} -.link { - border: 1px outset #88f; - background-color: #c0c0ff; - padding: 1px 4px 1px 4px; - font-size: 80%; - text-decoration: none; - font-weight: bold; - 
font-family: sans-serif; - color: black; -} -.link:hover { - color: red; -} -.link:active { - color: red; - border: 1px inset #88f; - background-color: #a0a0df; -} -.oaiRecord, .oaiRecordTitle { - background-color: #f0f0ff; - border-style: solid; - border-color: #d0d0d0; -} -h2.oaiRecordTitle { - background-color: #e0e0ff; - font-size: medium; - font-weight: bold; - padding: 10px; - border-width: 2px 2px 0px 2px; - margin: 0px; -} -.oaiRecord { - margin-bottom: 3em; - border-width: 2px; - padding: 10px; -} - -.results { - margin-bottom: 1.5em; -} -ul.quicklinks { - margin-top: 2px; - padding: 4px; - text-align: left; - border-bottom: 2px solid #ccc; - border-top: 2px solid #ccc; - clear: left; -} -ul.quicklinks li { - font-size: 80%; - display: inline; - list-stlye: none; - font-family: sans-serif; -} -p.intro { - font-size: 80%; -} - - - - + - Thoth OAI 2.0 - + + Thoth OAI-PMH Browser + -
    -

    Thoth OAI 2.0

    - -

    You are viewing an HTML version of the XML OAI response. To see the underlying XML use your web browsers view source option. More information about this XSLT is at the bottom of the page.

    -
    - -
    - -

    About the XSLT

    -

    An XSLT file has converted the OAI-PMH 2.0 responses into XHTML which looks nice in a browser which supports XSLT such as Mozilla, Firebird and Internet Explorer. The XSLT file was created by Christopher Gutteridge at the University of Southampton as part of the GNU EPrints system, and is freely redistributable under the GPL.

    If you want to use the XSL file on your own OAI interface you may but due to the way XSLT works you must install the XSL file on the same server as the OAI script, you can't just link to this copy.

    -
    +
    +
    + +
    +

    Thoth OAI-PMH Browser

    +

    Human-friendly view of OAI-PMH XML responses.

    + +
    +
    + +
    +

    Rendered by Thoth's OAI stylesheet. Use your browser's "View Source" to inspect raw XML.

    +
    +
    - + - - - - - - -
    Datestamp of response
    Request URL
    - +
    +

    Response Overview

    + + + + + + + +
    Response Date
    Request URL
    Verb + + + + + unknown + +
    +
    -

    OAI Error(s)

    -

    The request could not be completed due to the following error or errors.

    -
    - -
    +
    -

    Request was of type .

    -
    - - - - - - -
    + + + + + +
    - - - - - - - -
    Error Code
    -

    -
    - - - - - - - - - - - - - - - - - - -
    Repository Name
    Base URL
    Protocol Version
    Earliest Datestamp
    Deleted Record Policy
    Granularity
    - - -
    - - - Admin Email - - - - - - -

    Unsupported Description Type

    -

    The XSL currently does not support this type of description.

    -
    - -
    -
    - - - - - -

    OAI-Identifier

    - - - - - - - - - -
    Scheme
    Repository Identifier
    Delimiter
    Sample OAI Identifier
    -
    - - - - - -

    EPrints Description

    - -

    Content

    - -
    - -

    Submission Policy

    - -
    -

    Metadata Policy

    - -

    Data Policy

    - - -
    - - - -

    -
    - -
    -
    -
    - - -

    Comment

    -
    -
    - - - - - -

    Friends

    -
      - -
    -
    - - -
  • - -Identify
  • -
    - - - - - -

    Branding

    - - -
    - - -

    Icon

    - - - {br:title} - - - {br:title} - - + +
    +

    OAI Error:

    +

    +
    +
    + + +
    +

    Identify

    + + + + + + + + + + + + + +
    Repository Name
    Base URL
    Protocol Version
    Earliest Datestamp
    Deleted Record Policy
    Granularity
    Compression
    Admin Email
    + +
    - -

    Metadata Rendering Rule

    - - - - - - - + +

    OAI Identifier

    +
    URL
    Namespace
    Mime Type
    + + + +
    Scheme
    Repository Identifier
    Delimiter
    Sample Identifier
    - - - - - -

    Gateway Information

    - - - - - - - - - - - - - - + +

    Thoth Repository Metadata

    +
    Source
    Description
    URL
    Notes
    + + + + + +
    Latest Datestamp
    Rights Management
    Rights URL + + + + + Not provided + +
    - - Admin - + +

    Description (Additional Metadata)

    +
    + +
    - - - - +
    +

    GetRecord

    + +
    - - - - +
    +

    ListRecords

    + + +
    - - - - +
    +

    ListIdentifiers

    + + +
    - - - - +
    +

    ListSets

    + + +
    -

    Set

    - - - - -
    setName
    +
    +

    Set:

    +
    + + + +
    Set Spec
    Set Name
    + +
    +
    - + +

    Set Description

    +
    + +
    +
    - -
    -

    ListMetadataFormats

    - - -
    -
    +
    +

    ListMetadataFormats

    + +

    + Formats for identifier: + +

    +
    + +
    -

    Metadata Format

    - - - - - - - -
    metadataPrefix
    metadataNamespace
    schema
    -
    - - - - - - - - - -

    This is a list of metadata formats available for the record "". Use these links to view the metadata: - - - -

    -
    - -

    This is a list of metadata formats available from this archive.

    -
    -
    +
    +

    Metadata Format:

    +
    + + + + + + + +
    metadataPrefix + + ListRecords +
    metadataNamespace
    schema
    + +

    + + View this record in + +

    +
    +
    +
    - - -

    OAI Record:

    -
    - - - +
    +

    Record:

    +
    + + + +
    -

    OAI Record Header

    - - - - - - +

    Header

    +
    OAI Identifier - - oai_dc - oai_openaire - marcxml - formats -
    Datestamp
    + + + + + + + + + + + + + +
    Identifier + + + Formats +
    Datestamp
    Set Spec + + + Identifiers + + Records +
    Statusdeleted
    - -

    This record has been deleted.

    -
    -
    - - - -

    "about" part of record container not supported by the XSL

    -   - - - - - - - - - - setSpec - - Identifiers - Records - +

    Metadata

    + + + + + +

    No metadata payload for this record.

    +
    +
    - - - - - -

    There are more results.

    - - - - - - - - - - - -
    expirationDate
    completeListSize
    cursor
    resumptionToken: Resume
    + +

    About

    + + +
    + +
    +
    + +

    No additional about metadata.

    +
    +
    - - - -

    Unknown Metadata Format

    -
    - + +
    +

    Dublin Core (oai_dc)

    +
    + + +
    +
    - + + + + + + + + + + + + + + - -
    -

    MARC21 (marcxml)

    - - -
    + +
    +

    OpenAIRE (oai_openaire)

    +
    +
    + +
    +
    - - - -
    -

    OpenAIRE Metadata (oai_openaire)

    - - -
    + +
    +

    MARCXML (marcxml)

    +
    +
    + +
    +
    - - - -
    -

    Dublin Core Metadata (oai_dc)

    - - -
    + +
    +

    Metadata (Unsupported Format)

    +
    +
    + +
    +
    - -Title - - -Author or Creator - - -Subject and Keywords - - -Description - - -Publisher - - -Other Contributor - - -Date - - -Resource Type - - -Format - - -Resource Identifier - - -Source - - -Language - - -Relation - - + +
    +

    Resumption Token

    +
    - - URL - URL not shown as it is very long. + +

    More results are available.

    + + + + + + + + + + + + + + + +
    expirationDate
    completeListSize
    cursor
    token
    resume + Resume Listing +
    - +

    End of list. This empty token marks a terminal page.

    - - - - - - - - -Coverage - - -Rights Management - - - - -
    - <></> +
    - + +
    + <> + + </> +
    +
    - - ="" + + + ="" - -.xmlSource { - font-family: monospace; - line-height: 1.1rem; - border: solid #c0c0a0 1px; - background-color: #fff; - padding: 2em 2em 2em 0em; -} -.xmlBlock { - padding-left: 2em; -} -.xmlTagName { - color: #800000; - font-weight: bold; -} -.xmlAttrName { - font-weight: bold; -} -.xmlAttrValue { - color: #0000c0; -} + + - + + diff --git a/thoth-oai-server/src/lib.rs b/thoth-oai-server/src/lib.rs index 241bf9db9..c44172736 100644 --- a/thoth-oai-server/src/lib.rs +++ b/thoth-oai-server/src/lib.rs @@ -23,6 +23,8 @@ use uuid::Uuid; const LOG_FORMAT: &str = r#"%{r}a %a "%r" %s %b "%{Referer}i" "%{User-Agent}i" %T"#; const XSL_STYLESHEET: &str = include_str!("../assets/oai2.xsl"); +const METADATA_RIGHTS_STATEMENT: &str = "Metadata is licensed under the terms of Creative Commons CC0 1.0 Universal: https://creativecommons.org/publicdomain/zero/1.0/."; +const METADATA_RIGHTS_URI: &str = "https://creativecommons.org/publicdomain/zero/1.0/"; #[cfg(test)] const DEFAULT_RETRY_AFTER_SECONDS: u64 = 30; @@ -352,6 +354,8 @@ fn render_identify( \ \ {}\ +{}\ +{}\ \ \ ", @@ -362,6 +366,8 @@ fn render_identify( RECORD_PREFIX, SAMPLE_ID, xml_escape(&OaiService::timestamp_xml(latest)), + xml_escape(METADATA_RIGHTS_STATEMENT), + xml_escape(METADATA_RIGHTS_URI), ) } @@ -1585,6 +1591,12 @@ mod tests { if case == "verb=Identify" { assert!(get_body.contains("gzip")); assert!(post_body.contains("gzip")); + assert!(get_body.contains( + "Metadata is licensed under the terms of Creative Commons CC0 1.0 Universal: https://creativecommons.org/publicdomain/zero/1.0/." 
+ )); + assert!(post_body.contains( + "https://creativecommons.org/publicdomain/zero/1.0/" + )); } assert_eq!( @@ -1597,6 +1609,33 @@ mod tests { graphql_server.stop().await; } + #[actix_web::test] + async fn stylesheet_contains_branding_and_oai_rendering_support() { + let app = test::init_service( + App::new().service(web::resource("/oai2.xsl").route(web::get().to(stylesheet))), + ) + .await; + + let req = test::TestRequest::get().uri("/oai2.xsl").to_request(); + let response = test::call_service(&app, req).await; + assert_eq!(response.status(), actix_web::http::StatusCode::OK); + assert_eq!( + response + .headers() + .get(header::CONTENT_TYPE) + .and_then(|value| value.to_str().ok()), + Some("text/xsl; charset=utf-8") + ); + + let body = String::from_utf8(test::read_body(response).await.to_vec()) + .expect("stylesheet body UTF-8"); + assert!(body.contains("https://cdn.thoth.pub/THOTH_ColourPos.png")); + assert!(body.contains("Rights Management")); + assert!(body.contains("match=\"oai:setDescription\"")); + assert!(body.contains("match=\"oai:about\"")); + assert!(body.contains("End of list. This empty token marks a terminal page.")); + } + #[actix_web::test] async fn repeated_arguments_return_bad_argument() { let graphql_server = From e8328242bb02f47dc2508e2487838b2d2dea8dd4 Mon Sep 17 00:00:00 2001 From: Javier Arias Date: Mon, 13 Apr 2026 17:51:39 +0100 Subject: [PATCH 05/19] Replace xls --- thoth-oai-server/assets/oai2.xsl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/thoth-oai-server/assets/oai2.xsl b/thoth-oai-server/assets/oai2.xsl index f9e301b2d..b60a0f0a7 100644 --- a/thoth-oai-server/assets/oai2.xsl +++ b/thoth-oai-server/assets/oai2.xsl @@ -17,7 +17,7 @@ - Thoth OAI-PMH Browser + Thoth OAI-PMH