Skip to content

Commit 52c845a

Browse files
ekscrypto and claude committed
Security: fix noncharacters, U+2065 gap, C1 Q-decode, escaped clusters, planes 4-13
- Reject Unicode noncharacters U+FDD0–U+FDEF, U+FFFE, U+FFFF (Unicode §23.7)
- Close U+2065 gap between invisible format chars and bidi formatting chars
- RFC2047 Q-decode: reject C1 control bytes 0x80–0x9F (not just DEL)
- Quoted-string escaped position: require single-scalar ASCII cluster (RFC 5321)
- Reject supplementary planes 4–13 (U+40000–U+DFFFF, entirely unassigned)
- 122 tests, 0 failures

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 34f3aa4 commit 52c845a

4 files changed

Lines changed: 69 additions & 27 deletions

File tree

Sources/SwiftEmailValidator/EmailSyntaxValidator.swift

Lines changed: 21 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -308,12 +308,18 @@ public final class EmailSyntaxValidator {
308308
.union(CharacterSet(charactersIn: Unicode.Scalar(0x2066)!...Unicode.Scalar(0x2069)!)) // LRI, RLI, FSI, PDI
309309
private static let deprecatedFormatChars: CharacterSet = CharacterSet(charactersIn: Unicode.Scalar(0x206A)!...Unicode.Scalar(0x206F)!) // Deprecated formatting
310310
private static let bmpPrivateUseChars: CharacterSet = CharacterSet(charactersIn: Unicode.Scalar(0xE000)!...Unicode.Scalar(0xF8FF)!) // BMP Private Use Area
311+
// Unicode permanently-reserved noncharacters — §23.7: "forbidden for use in open interchange."
312+
// U+FDD0–U+FDEF fall in nonAsciiBmpHigh (above bmpPrivateUseChars) and would survive all
313+
// other subtractions without this explicit exclusion.
314+
private static let unicodeNonCharacters: CharacterSet =
315+
CharacterSet(charactersIn: Unicode.Scalar(0xFDD0)!...Unicode.Scalar(0xFDEF)!) // U+FDD0–U+FDEF permanently reserved noncharacters
316+
.union(CharacterSet(charactersIn: Unicode.Scalar(0xFFFE)!...Unicode.Scalar(0xFFFF)!)) // U+FFFE, U+FFFF BMP noncharacters
311317
// Invisible and zero-width format characters that produce no visible glyph.
312318
// Allowing them enables creating visually-identical but distinct email addresses (spoofing).
313319
private static let zeroWidthAndInvisibleChars: CharacterSet =
314320
CharacterSet(charactersIn: Unicode.Scalar(0x00AD)!...Unicode.Scalar(0x00AD)!) // U+00AD Soft Hyphen
315321
.union(CharacterSet(charactersIn: Unicode.Scalar(0x200B)!...Unicode.Scalar(0x200D)!)) // U+200B ZWS, U+200C ZWNJ, U+200D ZWJ
316-
.union(CharacterSet(charactersIn: Unicode.Scalar(0x2060)!...Unicode.Scalar(0x2064)!)) // U+2060 Word Joiner, U+2061-U+2064 invisible math operators
322+
.union(CharacterSet(charactersIn: Unicode.Scalar(0x2060)!...Unicode.Scalar(0x2065)!)) // U+2060 Word Joiner, U+2061-U+2064 invisible math operators, U+2065 reserved
317323
.union(CharacterSet(charactersIn: Unicode.Scalar(0xFEFF)!...Unicode.Scalar(0xFEFF)!)) // U+FEFF BOM / Zero Width No-Break Space
318324
.union(CharacterSet(charactersIn: Unicode.Scalar(0x2028)!...Unicode.Scalar(0x2029)!)) // U+2028 Line Separator, U+2029 Paragraph Separator
319325
.union(CharacterSet(charactersIn: Unicode.Scalar(0xFE00)!...Unicode.Scalar(0xFE0F)!)) // U+FE00-U+FE0F Variation Selectors (invisible combiners, spoofing)
@@ -350,6 +356,7 @@ public final class EmailSyntaxValidator {
350356
.subtracting(deprecatedFormatChars) // Exclude deprecated format characters
351357
.subtracting(bmpPrivateUseChars) // Exclude BMP Private Use Area (U+E000-U+F8FF)
352358
.subtracting(zeroWidthAndInvisibleChars) // Exclude invisible format characters (spoofing prevention)
359+
.subtracting(unicodeNonCharacters) // Exclude permanently-reserved Unicode noncharacters (§23.7)
353360
.union(supplementaryPlanes) // Supplementary planes (emoji, etc.) - MUST BE LAST (after subtractions)
354361

355362
// RFC 952/1123: domain labels are LDH (letters, digits, hyphens); Unicode letters are
@@ -380,6 +387,7 @@ public final class EmailSyntaxValidator {
380387
.subtracting(deprecatedFormatChars) // Exclude deprecated format characters
381388
.subtracting(bmpPrivateUseChars) // Exclude BMP Private Use Area (U+E000-U+F8FF)
382389
.subtracting(zeroWidthAndInvisibleChars) // Exclude invisible format characters (spoofing prevention)
390+
.subtracting(unicodeNonCharacters) // Exclude permanently-reserved Unicode noncharacters (§23.7)
383391
.union(supplementaryPlanes) // Supplementary planes (emoji, etc.) - MUST BE LAST (after subtractions)
384392

385393
private static func extractDotAtom(_ candidate: String, compatibility: Compatibility) -> String? {
@@ -404,10 +412,12 @@ public final class EmailSyntaxValidator {
404412
// Reject supplementary-plane ranges excluded from allowedCharacterSet via
405413
// explicit scalar guards (Foundation CharacterSet.contains() is reliable for
406414
// individual scalars, but belt-and-suspenders for these security-sensitive ranges):
415+
// U+40000-U+DFFFF: Planes 4-13 (entirely unassigned in Unicode)
407416
// U+E0000-U+EFFFF: entire SSP (Tags block, unassigned gaps, VS Supplement)
408417
// U+F0000-U+10FFFF: Supplementary PUA-A/B
409418
&& !label.unicodeScalars.contains(where: {
410-
($0.value >= 0xE0000 && $0.value <= 0x10FFFF) // Entire SSP + PUA-A/B
419+
($0.value >= 0x40000 && $0.value <= 0xDFFFF) // Planes 4-13 (entirely unassigned)
420+
|| ($0.value >= 0xE0000 && $0.value <= 0x10FFFF) // Entire SSP + PUA-A/B
411421
})
412422
})
413423
else {
@@ -449,10 +459,13 @@ public final class EmailSyntaxValidator {
449459
guard !character.unicodeScalars.contains(where: { s in
450460
s.value == 0x00AD || // U+00AD Soft Hyphen
451461
(s.value >= 0x200B && s.value <= 0x200D) || // U+200B-U+200D ZWS/ZWNJ/ZWJ
452-
(s.value >= 0x2060 && s.value <= 0x2064) || // U+2060-U+2064 invisible format chars
462+
(s.value >= 0x2060 && s.value <= 0x2065) || // U+2060-U+2065 invisible/reserved format chars
453463
s.value == 0xFEFF || // U+FEFF BOM
454464
(s.value >= 0xFE00 && s.value <= 0xFE0F) || // U+FE00-U+FE0F Variation Selectors
455465
(s.value == 0x2028 || s.value == 0x2029) || // U+2028 Line Sep, U+2029 Para Sep
466+
(s.value >= 0xFDD0 && s.value <= 0xFDEF) || // U+FDD0-U+FDEF Unicode noncharacters
467+
(s.value == 0xFFFE || s.value == 0xFFFF) || // U+FFFE/U+FFFF BMP noncharacters
468+
(s.value >= 0x40000 && s.value <= 0xDFFFF) || // Planes 4-13 (entirely unassigned)
456469
(s.value >= 0xE0000 && s.value <= 0x10FFFF) // Entire SSP (Tags, unassigned gaps, VS Sup) + PUA-A/B
457470
}) else {
458471
return nil
@@ -470,7 +483,11 @@ public final class EmailSyntaxValidator {
470483

471484
if escaped {
472485
cleanedText.append(character)
473-
guard quotedPairSMTP.contains(characterScalar) else {
486+
// RFC 5321: quoted-pair = "\" (VCHAR / WSP) — exactly one printable ASCII scalar.
487+
// A multi-scalar grapheme cluster (e.g. e + U+0301 combining acute) would have its
488+
// first scalar pass quotedPairSMTP while the additional scalars go unchecked.
489+
guard character.unicodeScalars.count == 1,
490+
quotedPairSMTP.contains(characterScalar) else {
474491
return nil
475492
}
476493
escaped = false

Sources/SwiftEmailValidator/RFC2047Coder.swift

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -137,7 +137,10 @@ public final class RFC2047Coder {
137137
digitsCaptured += 1
138138
if digitsCaptured == 1 { continue nextCharacter }
139139

140-
guard value >= 0x20 && value != 0x7F,
140+
// Reject C0 controls (0x00–0x1F), DEL (0x7F), and C1 controls (0x80–0x9F).
141+
// The prior guard (>= 0x20 && != 0x7F) inadvertently admitted 0x80–0x9F, which
142+
// String(data:encoding:isoLatin1) maps to U+0080–U+009F (C1 control characters).
143+
guard (value >= 0x20 && value < 0x7F) || value >= 0xA0,
141144
let decodedCharacter = String(data: Data([value]), encoding: stringEncoding)
142145
else {
143146
return nil

Tests/SwiftEmailValidatorTests/EmailSyntaxValidatorTests.swift

Lines changed: 41 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -1030,17 +1030,12 @@ final class EmailSyntaxValidatorTests: XCTestCase {
10301030
}
10311031
}
10321032

1033-
// MARK: - Review: Tests documenting bugs (will fail until code is fixed)
1033+
// MARK: - Unicode noncharacter and reserved codepoint exclusions
10341034

10351035
func testUnicodeNonCharactersRejectedInLocalPart() {
10361036
// Unicode permanently-reserved noncharacters (U+FDD0–U+FDEF, U+FFFE, U+FFFF)
10371037
// have no defined semantics and per Unicode §23.7 are "forbidden for use in
1038-
// open interchange of Unicode text data."
1039-
// Bug: these fall in nonAsciiBmpHigh (0xE000–0xFFFF) and survive all
1040-
// .subtracting() calls — bmpPrivateUseChars only covers up to U+F8FF,
1041-
// and zeroWidthAndInvisibleChars does not list this range.
1042-
// The explicit scalar guard in extractDotAtom only starts at 0xE0000 (SSP),
1043-
// so U+FDD0–U+FFFF are currently accepted.
1038+
// open interchange of Unicode text data." Both local-part formats must reject them.
10441039
let permissive: (String) -> Bool = { _ in true }
10451040
let nonCharacters: [(Unicode.Scalar, String)] = [
10461041
(Unicode.Scalar(0xFDD0)!, "U+FDD0 (first noncharacter in FDD0–FDEF range)"),
@@ -1064,11 +1059,8 @@ final class EmailSyntaxValidatorTests: XCTestCase {
10641059
}
10651060

10661061
func testReservedFormatCharU2065RejectedInLocalPart() {
1067-
// U+2065 is an unassigned/reserved code point that sits in the gap between
1068-
// zeroWidthAndInvisibleChars (ends at U+2064) and bidiFormattingChars (starts at U+2066).
1069-
// It should be excluded like its immediate neighbours U+2064 and U+2066.
1070-
// Bug: neither the CharacterSet nor the inline scalar guard in extractQuotedString
1071-
// covers this single code point.
1062+
// U+2065 is unassigned/reserved and sits between invisible format chars (U+2060–U+2064)
1063+
// and bidi formatting chars (U+2066–U+2069). It must be excluded like its neighbours.
10721064
let permissive: (String) -> Bool = { _ in true }
10731065
XCTAssertNil(
10741066
EmailSyntaxValidator.mailbox(from: "user\u{2065}@site.com",
@@ -1095,10 +1087,8 @@ final class EmailSyntaxValidatorTests: XCTestCase {
10951087

10961088
func testEscapedMultiScalarClusterRejectedInUnicodeQuotedString() {
10971089
// RFC 5321 §3.3: quoted-pair = "\" (VCHAR / WSP) — exactly one printable ASCII character.
1098-
// Bug: in Unicode mode the escape path checks only characterScalar (first scalar) against
1099-
// quotedPairSMTP (0x20–0x7E). A grapheme cluster with a non-ASCII combining scalar
1100-
// (e.g., e + U+0301 combining acute = decomposed "é", 2 scalars) passes because the
1101-
// first scalar 'e' is in the ASCII-printable range.
1090+
// A multi-scalar grapheme cluster in an escape position must be rejected even if its
1091+
// first scalar is ASCII-printable (e.g. e + U+0301 combining acute = 2 scalars).
11021092
let permissive: (String) -> Bool = { _ in true }
11031093
// "\" followed by e+U+0301 (two-scalar grapheme cluster) in a quoted local part
11041094
let twoScalarEscaped = "\"\\" + "e\u{0301}" + "\"@site.com"
@@ -1114,4 +1104,39 @@ final class EmailSyntaxValidatorTests: XCTestCase {
11141104
"Escaped single ASCII scalar must still be accepted"
11151105
)
11161106
}
1107+
1108+
func testSupplementaryPlanes4Through13RejectedInLocalPart() {
1109+
// Planes 4–13 (U+40000–U+DFFFF) are entirely unassigned in Unicode and must be rejected.
1110+
let permissive: (String) -> Bool = { _ in true }
1111+
let planeProbes: [(Unicode.Scalar, String)] = [
1112+
(Unicode.Scalar(0x40000)!, "U+40000 (first scalar of Plane 4, unassigned)"),
1113+
(Unicode.Scalar(0x7FFFF)!, "U+7FFFF (last scalar of Plane 7, unassigned)"),
1114+
(Unicode.Scalar(0x80000)!, "U+80000 (first scalar of Plane 8, unassigned)"),
1115+
(Unicode.Scalar(0xDFFFF)!, "U+DFFFF (last scalar of Plane 13, unassigned)"),
1116+
]
1117+
for (scalar, name) in planeProbes {
1118+
let char = String(scalar)
1119+
XCTAssertNil(
1120+
EmailSyntaxValidator.mailbox(from: "user\(char)@site.com",
1121+
compatibility: .unicode, domainValidator: permissive),
1122+
"\(name) must be rejected in dot-atom local part"
1123+
)
1124+
XCTAssertNil(
1125+
EmailSyntaxValidator.mailbox(from: "\"user\(char)\"@site.com",
1126+
compatibility: .unicode, domainValidator: permissive),
1127+
"\(name) must be rejected in quoted-string local part"
1128+
)
1129+
}
1130+
// SMP (Plane 1) characters must remain accepted — the fix must not over-reach
1131+
XCTAssertNotNil(
1132+
EmailSyntaxValidator.mailbox(from: "user\u{1F600}@site.com",
1133+
compatibility: .unicode, domainValidator: permissive),
1134+
"U+1F600 (emoji, SMP Plane 1) must still be accepted"
1135+
)
1136+
XCTAssertNotNil(
1137+
EmailSyntaxValidator.mailbox(from: "user\u{3FFFF}@site.com",
1138+
compatibility: .unicode, domainValidator: permissive),
1139+
"U+3FFFF (last scalar of Plane 3 / TIP, assigned range boundary) must still be accepted"
1140+
)
1141+
}
11171142
}

Tests/SwiftEmailValidatorTests/RFC2047CoderTests.swift

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -307,14 +307,11 @@ final class RFC2047CoderTests: XCTestCase {
307307
"Clean base64 encoded word should decode correctly")
308308
}
309309

310-
// MARK: - Review: Test documenting a bug (will fail until code is fixed)
310+
// MARK: - C1 control byte rejection in Q-encoded ISO-8859-1/2
311311

312312
func testDecodingLatin1QC1ControlBytesRejected() {
313-
// RFC 2047 Q-encoding for ISO-8859-1/2 must reject C1 control bytes (0x80–0x9F).
314-
// Bug: the current guard (value >= 0x20 && value != 0x7F) lets bytes 0x80–0x9F through.
315-
// String(data: Data([0x80]), encoding: .isoLatin1) succeeds and returns U+0080 (C1 control),
316-
// so decode() returns a non-nil string containing a C1 character instead of nil.
317-
// The fix is: (value >= 0x20 && value < 0x7F) || value >= 0xA0
313+
// C1 control bytes (0x80–0x9F) must be rejected from Q-encoded ISO-8859-1/2 words.
314+
// They are not valid interchange characters and must not decode to C1 Unicode scalars.
318315
XCTAssertNil(RFC2047Coder.decode("=?iso-8859-1?q?=80?="),
319316
"0x80 (first C1 control) must be rejected from Q-encoded ISO-8859-1")
320317
XCTAssertNil(RFC2047Coder.decode("=?iso-8859-1?q?=90?="),

0 commit comments

Comments
 (0)