|
| 1 | +import Foundation |
| 2 | + |
| 3 | +/// Lightweight HTML stripping and entity decoding for plain-text contexts. |
| 4 | +/// Full HTML rendering uses WKWebView in the detail screen. |
extension String {

    /// Named HTML entities recognised by `strippingHTMLEntities()`, keyed by the
    /// entity name without the leading `&` and trailing `;`.
    /// Entity names are case-sensitive, so lookups are exact.
    private static let htmlNamedEntities: [String: String] = [
        "amp": "&", "lt": "<", "gt": ">",
        "quot": "\"", "apos": "'", "nbsp": "\u{00A0}",
        "ndash": "\u{2013}", "mdash": "\u{2014}",
        "laquo": "\u{00AB}", "raquo": "\u{00BB}",
        "bull": "\u{2022}", "hellip": "\u{2026}",
        "copy": "\u{00A9}", "reg": "\u{00AE}",
        "trade": "\u{2122}", "euro": "\u{20AC}",
    ]

    /// Decode HTML character entities (`&amp;`, `&lt;`, `&#39;`, `&#x27;`, etc.).
    ///
    /// Decoding is done in a single left-to-right pass, so text produced by one
    /// replacement is never re-scanned: `&amp;lt;` becomes `&lt;`, not `<`.
    /// Entities that are unknown, or numeric references that are not valid
    /// Unicode scalars (surrogates, out-of-range values), are left untouched.
    ///
    /// - Returns: A copy of the string with recognised entities replaced by the
    ///   characters they denote.
    func strippingHTMLEntities() -> String {
        // Group 1: decimal digits, group 2: hex digits, group 3: entity name.
        let pattern = "&(?:#([0-9]+)|#[xX]([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));"
        guard contains("&" as Character),
              let regex = try? NSRegularExpression(pattern: pattern) else {
            return self
        }

        var decoded = ""
        decoded.reserveCapacity(utf8.count)
        var cursor = startIndex
        let wholeRange = NSRange(startIndex..., in: self)

        // Matches arrive in order and never overlap, so the output can be built
        // by alternating "text before the match" and "replacement".
        for match in regex.matches(in: self, range: wholeRange) {
            guard let matchRange = Range(match.range, in: self),
                  let replacement = String.htmlEntityReplacement(for: match, in: self) else {
                // Not decodable: leave it in place; it is copied verbatim with
                // the next chunk because the cursor does not advance.
                continue
            }
            decoded.append(contentsOf: self[cursor..<matchRange.lowerBound])
            decoded.append(replacement)
            cursor = matchRange.upperBound
        }
        decoded.append(contentsOf: self[cursor...])
        return decoded
    }

    /// Resolve a single entity match to its replacement text.
    ///
    /// - Returns: The decoded character(s), or `nil` when the entity is unknown
    ///   or the numeric value is not a valid Unicode scalar.
    private static func htmlEntityReplacement(
        for match: NSTextCheckingResult,
        in text: String
    ) -> String? {
        // A capture group that did not participate has location NSNotFound,
        // for which the Range conversion returns nil.
        if let digits = Range(match.range(at: 1), in: text) {
            guard let value = UInt32(text[digits]),
                  let scalar = Unicode.Scalar(value) else { return nil }
            return String(Character(scalar))
        }
        if let hexDigits = Range(match.range(at: 2), in: text) {
            guard let value = UInt32(text[hexDigits], radix: 16),
                  let scalar = Unicode.Scalar(value) else { return nil }
            return String(Character(scalar))
        }
        if let name = Range(match.range(at: 3), in: text) {
            return htmlNamedEntities[String(text[name])]
        }
        return nil
    }

    /// Strip all HTML tags and decode entities, returning plain text.
    ///
    /// `<br>` and the block-level tags `p`, `pre`, `div`, `li`, `tr`, `h1`–`h6`
    /// are replaced with newlines; every other tag is removed. Runs of
    /// whitespace inside a line collapse to one space, three or more
    /// consecutive newlines collapse to two, and the result is trimmed.
    ///
    /// - Returns: The plain-text rendering of the receiver.
    func strippingHTML() -> String {
        guard !isEmpty else { return "" }
        var text = self
        // Replace <br> with newline
        text = text.replacingOccurrences(
            of: "<br\\s*/?>",
            with: "\n",
            options: [.regularExpression, .caseInsensitive]
        )
        // Replace block-level closing/opening tags with newline. The word
        // boundary stops tag names that merely start with these letters
        // (<link>, <track>, <progress>, …) from being treated as blocks.
        text = text.replacingOccurrences(
            of: "</?(?:p|pre|div|li|tr|h[1-6])\\b[^>]*>",
            with: "\n",
            options: [.regularExpression, .caseInsensitive]
        )
        // Strip remaining tags
        text = text.replacingOccurrences(
            of: "<[^>]*>",
            with: "",
            options: .regularExpression
        )
        // Decode entities only after tags are gone, so escaped markup such as
        // "&lt;b&gt;" survives as literal text instead of being stripped.
        text = text.strippingHTMLEntities()
        // Collapse whitespace within each line
        text = text.components(separatedBy: .newlines)
            .map {
                $0.trimmingCharacters(in: .whitespaces)
                    .replacingOccurrences(of: "\\s{2,}", with: " ", options: .regularExpression)
            }
            .joined(separator: "\n")
        // Collapse multiple blank lines
        text = text.replacingOccurrences(of: "\n{3,}", with: "\n\n", options: .regularExpression)
        return text.trimmingCharacters(in: .whitespacesAndNewlines)
    }
}
| 87 | + |
0 commit comments