diff --git a/docs/file-format.md b/docs/file-format.md index 1b70d129..79d9e847 100644 --- a/docs/file-format.md +++ b/docs/file-format.md @@ -537,9 +537,13 @@ ampersands, and hyphens. Numeric variables can express a single number or a range and contain only integers, but may contain negative numbers. Numeric variables can have a non-numeric prefix and suffix. ```yaml -page-range: S10-15 +page-range: S10-15 # Page S10 to 15 ``` +Note that if the prefix or suffix of a number itself contains digits, the whole +variable will be interpreted as a [string](#string) instead of a number. This +improves the styling of atypical page numbers like `11E201`. + #### Unicode Language Identifier A [Unicode Language Identifier](https://unicode.org/reports/tr35/tr35.html#unicode_language_id) identifies a language or its variants. At the simplest, you can specify an all-lowercase [two-letter ISO 639-1 code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) like `en` or `es` as a language. It is possible to specify regions, scripts, or variants to more precisely identify a variety of a language, especially in cases where the ISO 639-1 code is considered a "macrolanguage" (`zh` includes both Cantonese and Mandarin). In such cases, specify values like `en-US` for American English or `zh-Hans-CN` for Mandarin written in simplified script in mainland China. The region tags have to be written in all-caps and are mostly corresponding to [ISO 3166-1 alpha_2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Officially_assigned_code_elements) codes. 
diff --git a/src/csl/rendering/mod.rs b/src/csl/rendering/mod.rs index 63fe4e64..6bca3bfa 100644 --- a/src/csl/rendering/mod.rs +++ b/src/csl/rendering/mod.rs @@ -1132,6 +1132,10 @@ impl Iterator for BranchConditionIter<'_, '_, T> { self.ctx.resolve_number_variable(var), Some(NumberVariableResult::Regular(MaybeTyped::Typed(_))) ), + Variable::Page(var) => matches!( + self.ctx.resolve_page_variable(var), + Some(MaybeTyped::Typed(_)) + ), _ => false, }) } else { diff --git a/src/types/mod.rs b/src/types/mod.rs index c552bb4c..b96e40a8 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -552,6 +552,12 @@ mod tests { assert!(val.suffix.is_none()); assert_eq!(&val.to_string(), "-5"); + let val = Numeric::from_str("01").unwrap(); + assert!(val.value == NumericValue::Number(1)); + assert_eq!(val.prefix_str(), Some("0")); + assert!(val.suffix.is_none()); + assert_eq!(&val.to_string(), "01"); + let val = Numeric::from_str("1st").unwrap(); assert!(val.value == NumericValue::Number(1)); assert!(val.prefix.is_none()); @@ -606,6 +612,34 @@ mod tests { assert!(Numeric::from_str("2nd edition").is_err()); } + #[test] + fn test_preserve_space_separator() { + // https://github.com/typst/hayagriva/issues/312 + // https://github.com/typst/hayagriva/issues/440 + let serial_numbers = + &["ISO/IEC 23009-1:2022(E)", "GB/T 7714—2025", "GB/T 7714", "第 6 册"]; + for s in serial_numbers { + let val: MaybeTyped = MaybeTyped::infallible_from_str(s); + // It can be either typed or string, as long as whitespaces between + // affixes and numbers are preserved. + assert_eq!(val.to_string(), *s); + } + + // For GB standards, em dash is the recommended separator, but + // hyphen-minus and en dash should also be supported. 
+ let dashes = ["-", "–", "—"]; + for dash in dashes { + let s = format!("GB/T 7714{dash}2015"); + let val: MaybeTyped = MaybeTyped::infallible_from_str(&s); + assert_eq!( + val.to_string() + .replace(dashes[0], dashes[2]) + .replace(dashes[1], dashes[2]), + format!("GB/T 7714{}2015", dashes[2]) + ); + } + } + #[test] #[cfg(feature = "biblatex")] fn test_issue_227() { diff --git a/src/types/numeric.rs b/src/types/numeric.rs index 261f65ff..ac509323 100644 --- a/src/types/numeric.rs +++ b/src/types/numeric.rs @@ -246,11 +246,22 @@ impl FromStr for Numeric { fn from_str(value: &str) -> Result { let mut s = Scanner::new(value); - let prefix = - s.eat_while(|c: char| !c.is_numeric() && !c.is_whitespace() && c != '-'); + s.eat_whitespace(); + + let prefix = { + // Eat non-numeric characters and leading zeros. + let start = s.cursor(); + s.eat_while(|c: char| !c.is_numeric() && c != '-'); + let zeros = s.eat_while('0'); + if !zeros.is_empty() && s.peek().is_none_or(|c| !c.is_numeric()) { + // Uneat the last zero if the value is just zero. 
+ s.uneat(); + } + s.from(start) + }; let value = number(&mut s).ok_or(NumericError::NoNumber)?; - s.eat_whitespace(); + let space_after_value = s.eat_whitespace(); let value = match s.peek() { Some(c) if is_delimiter(c) => { @@ -258,6 +269,7 @@ impl FromStr for Numeric { s.eat_until(|c: char| !is_delimiter(c)); let mut items = vec![(value, Some(NumericDelimiter::try_from(c)?))]; loop { + s.eat_whitespace(); let num = number(&mut s).ok_or(NumericError::NoNumber)?; s.eat_whitespace(); match NumericDelimiter::from_str(s.eat_while(is_delimiter)) { @@ -276,7 +288,7 @@ impl FromStr for Numeric { _ => NumericValue::Number(value), }; s.eat_whitespace(); - let post = s.eat_while(|c: char| !c.is_whitespace()); + let post = s.eat_while(|c: char| !c.is_numeric() && !c.is_whitespace()); if !s.after().is_empty() { return Err(NumericError::UnexpectedCharactersAfterPostfix); @@ -289,7 +301,11 @@ impl FromStr for Numeric { } else { Some(Box::new(prefix.to_string())) }, - suffix: if post.is_empty() { None } else { Some(Box::new(post.to_string())) }, + suffix: if post.is_empty() { + None + } else { + Some(Box::new(format!("{space_after_value}{post}"))) + }, }) } } @@ -324,8 +340,11 @@ pub enum NumericError { MissingDelimiter, } +/// Eat a number from the scanner, assuming leading whitespaces and zeros have +/// already been eaten. +/// +/// The number can be positive, negative, or zero. 
fn number(s: &mut Scanner) -> Option { - s.eat_whitespace(); let negative = s.eat_if('-'); let num = s.eat_while(|c: char| c.is_numeric()); if num.is_empty() { diff --git a/src/types/page.rs b/src/types/page.rs index 6251e053..4d205b45 100644 --- a/src/types/page.rs +++ b/src/types/page.rs @@ -392,6 +392,8 @@ where #[cfg(test)] mod test { + use super::*; + #[test] fn group_by() { fn group(s: &str) -> Vec<&'_ str> { @@ -410,4 +412,29 @@ mod test { assert_eq!(["–a", ","], group("–a,").as_slice()); assert_eq!(["a–", ",", "–b"], group("a–,–b").as_slice()); } + + #[test] + fn nonnumeric_page() { + // https://github.com/typst/hayagriva/issues/170 + for s in &["11E201", "1.36"] { + let n: MaybeTyped = MaybeTyped::infallible_from_str(s); + assert_eq!(n, MaybeTyped::String(s.to_string())); + } + + // Page ranges should still be parsed as numeric values. + assert_eq!( + MaybeTyped::::infallible_from_str("S10-15"), + MaybeTyped::Typed(PageRanges::new(vec![PageRangesPart::Range( + Numeric::from_str("S10").unwrap(), + Numeric::from_str("15").unwrap(), + )])) + ); + assert_eq!( + MaybeTyped::::infallible_from_str("011-012"), + MaybeTyped::Typed(PageRanges::new(vec![PageRangesPart::Range( + Numeric::from_str("011").unwrap(), + Numeric::from_str("012").unwrap(), + )])) + ); + } } diff --git a/tests/citeproc-pass.txt b/tests/citeproc-pass.txt index e7b60830..5e366387 100644 --- a/tests/citeproc-pass.txt +++ b/tests/citeproc-pass.txt @@ -409,7 +409,9 @@ nameorder_ShortNameAsSortDemoteNever namespaces_NonNada3 number_FailingDelimiters number_IsNumericWithAlpha +number_LeadingZeros number_MixedPageRange +number_MixedText number_PageFirst number_PageRange number_SimpleNumberArabic