In this tutorial, you'll learn how to use Term's comprehensive format validators to ensure data quality and consistency. We'll explore validators for emails, URLs, credit cards, phone numbers, and more, understanding how to apply them effectively to your datasets.
What you'll learn:
- How to validate common data formats (email, URL, phone, etc.)
- How to set validation thresholds for partial compliance
- How to create custom format validators
- How to combine multiple format validations
Before starting, you should:
- Have Term installed and configured
- Understand basic Term validation concepts
- Have sample data with various format types
Format validation ensures data follows expected patterns. Term provides both built-in validators and custom pattern matching:
use term_guard::prelude::*;
// Built-in validators use optimized patterns
check.validates_email("email_column", 1.0) // 100% must be valid
// Custom validators use regex patterns
check.validates_regex("product_code", r"^PRD-\d{6}$", 0.95) // 95% threshold

Let's start by validating email addresses in a customer dataset:
use term_guard::prelude::*;
use datafusion::prelude::*;
/// Registers `data/customers.csv` as the `customers` table, runs the
/// `customer_email_validation` suite against it, and prints one status line
/// per check.
///
/// # Errors
///
/// Returns early with the error if registering the CSV file or running the
/// suite fails.
#[tokio::main]
async fn main() -> Result<()> {
    // Load customer data
    let ctx = SessionContext::new();
    ctx.register_csv(
        "customers",
        "data/customers.csv",
        CsvReadOptions::default(),
    )
    .await?;

    // Create validation suite for emails
    let suite = ValidationSuite::builder("customer_email_validation")
        .add_check(
            Check::builder("validate_primary_emails")
                .description("Ensure all primary emails are valid")
                .validates_email("primary_email", 1.0) // 100% must be valid
                .build(),
        )
        .add_check(
            Check::builder("validate_secondary_emails")
                .description("Check secondary emails (allowing some invalid)")
                .validates_email("secondary_email", 0.95) // 95% threshold
                .build(),
        )
        .build();

    // Run validation
    let results = suite.run(&ctx).await?;

    // Check results
    for check_result in results.check_results() {
        // Match on a borrow of the status: binding `msg` by value would move
        // the message out of `check_result`, which is rejected when the
        // iterator hands out references.
        match &check_result.status {
            CheckStatus::Success => {
                println!("✓ {}: All emails valid", check_result.check.name());
            }
            CheckStatus::Warning(msg) => {
                println!("⚠ {}: {}", check_result.check.name(), msg);
            }
            CheckStatus::Error(msg) => {
                println!("✗ {}: {}", check_result.check.name(), msg);

                // Get specific validation metrics (first constraint only)
                if let Some(metrics) = check_result.constraint_results.first() {
                    println!(
                        " Valid: {}/{} ({:.1}%)",
                        metrics.num_passed,
                        metrics.num_validated,
                        metrics.compliance_rate * 100.0
                    );
                }
            }
        }
    }

    Ok(())
}

Validate URLs with different strictness levels:
/// Runs the `web_link_validation` suite and prints its summary.
///
/// Two checks are registered: a strict one on `product_url` (threshold 0.99)
/// and a permissive one on `api_endpoint` (threshold 1.0).
///
/// # Errors
///
/// Propagates any error returned while running the suite.
async fn validate_web_links(ctx: &SessionContext) -> Result<()> {
    // Product pages: protocol and TLD required, HTTPS only, no localhost or IPs.
    let product_url_rules = UrlValidationOptions {
        require_protocol: true,
        require_tld: true,
        allow_localhost: false,
        allow_ip_address: false,
        allowed_protocols: vec!["https".to_string()],
    };

    // API endpoints: protocol required, but internal domains, localhost and
    // IP addresses are accepted over either http or https.
    let api_endpoint_rules = UrlValidationOptions {
        require_protocol: true,
        require_tld: false,
        allow_localhost: true,
        allow_ip_address: true,
        allowed_protocols: vec!["http".to_string(), "https".to_string()],
    };

    let product_urls = Check::builder("product_urls")
        .description("Validate product page URLs")
        .validates_url_with_options("product_url", 0.99, product_url_rules)
        .build();

    let api_endpoints = Check::builder("api_endpoints")
        .description("Validate API endpoint URLs")
        .validates_url_with_options("api_endpoint", 1.0, api_endpoint_rules)
        .build();

    let suite = ValidationSuite::builder("web_link_validation")
        .add_check(product_urls)
        .add_check(api_endpoints)
        .build();

    let report = suite.run(ctx).await?;
    report.print_summary();
    Ok(())
}

Validate credit cards and financial identifiers:
/// Runs the `payment_validation` suite and prints its summary.
///
/// The suite validates `card_number` as a credit card number (threshold 1.0)
/// and checks, via a SQL `CASE` expression, that each number's prefix agrees
/// with the value in `card_type`.
///
/// # Errors
///
/// Propagates any error returned while running the suite.
async fn validate_payment_data(ctx: &SessionContext) -> Result<()> {
    let card_numbers = Check::builder("credit_card_numbers")
        .description("Validate credit card numbers with Luhn algorithm")
        .validates_credit_card("card_number", 1.0) // All must be valid
        .build();

    // NOTE(review): only the 51-55 Mastercard prefixes are covered here; the
    // newer 2221-2720 range is not — confirm whether it needs to be.
    let card_types = Check::builder("card_type_consistency")
        .description("Ensure card numbers match their types")
        .satisfies(
            // The continuation lines stay at column 0 so the SQL text that
            // reaches the engine is unchanged.
            "CASE
WHEN card_type = 'VISA' THEN card_number LIKE '4%'
WHEN card_type = 'MASTERCARD' THEN
card_number LIKE '51%' OR
card_number LIKE '52%' OR
card_number LIKE '53%' OR
card_number LIKE '54%' OR
card_number LIKE '55%'
WHEN card_type = 'AMEX' THEN
card_number LIKE '34%' OR
card_number LIKE '37%'
ELSE true
END",
            "Card numbers must match their type prefix",
            1.0,
        )
        .build();

    let suite = ValidationSuite::builder("payment_validation")
        .add_check(card_numbers)
        .add_check(card_types)
        .build();

    // Report the outcome instead of dropping it, as `validate_web_links` does.
    let results = suite.run(ctx).await?;
    results.print_summary();
    Ok(())
}

Validate phone numbers with regional options:
/// Runs the `phone_validation` suite and prints its summary.
///
/// `phone` is checked against three explicit US formats (threshold 0.98);
/// `intl_phone` is checked with international numbers allowed and a country
/// code required (threshold 0.95).
///
/// # Errors
///
/// Propagates any error returned while running the suite.
async fn validate_contact_numbers(ctx: &SessionContext) -> Result<()> {
    let suite = ValidationSuite::builder("phone_validation")
        .add_check(
            Check::builder("us_phone_numbers")
                .description("Validate US phone numbers")
                .validates_phone_with_options(
                    "phone",
                    0.98, // 98% threshold
                    PhoneValidationOptions {
                        country: Some("US".to_string()),
                        allow_international: false,
                        require_country_code: false,
                        formats: vec![
                            r"^\d{3}-\d{3}-\d{4}$".to_string(),       // 555-123-4567
                            r"^\(\d{3}\) \d{3}-\d{4}$".to_string(),   // (555) 123-4567
                            r"^\d{10}$".to_string(),                  // 5551234567
                        ],
                    },
                )
                .build(),
        )
        .add_check(
            Check::builder("international_phones")
                .description("Validate international phone numbers")
                .validates_phone_with_options(
                    "intl_phone",
                    0.95,
                    PhoneValidationOptions {
                        country: None, // Any country
                        allow_international: true,
                        require_country_code: true,
                        formats: vec![], // Use default international formats
                    },
                )
                .build(),
        )
        .build();

    // Report the outcome instead of dropping it, as `validate_web_links` does.
    let results = suite.run(ctx).await?;
    results.print_summary();
    Ok(())
}

Validate various types of identifiers:
/// Runs the `identifier_validation` suite and prints its summary.
///
/// Checks `transaction_id` as a UUID, `client_ip` as IPv4, `server_ipv6` as
/// IPv6, and `zip_code` as a `"US"` postal code, each with its own threshold.
///
/// # Errors
///
/// Propagates any error returned while running the suite.
async fn validate_identifiers(ctx: &SessionContext) -> Result<()> {
    let suite = ValidationSuite::builder("identifier_validation")
        .add_check(
            Check::builder("uuid_validation")
                .description("Validate UUID formats")
                .validates_uuid("transaction_id", 1.0)
                .build(),
        )
        .add_check(
            Check::builder("ipv4_addresses")
                .description("Validate IPv4 addresses")
                .validates_ipv4("client_ip", 0.99)
                .build(),
        )
        .add_check(
            Check::builder("ipv6_addresses")
                .description("Validate IPv6 addresses")
                .validates_ipv6("server_ipv6", 0.95)
                .build(),
        )
        .add_check(
            Check::builder("postal_codes")
                .description("Validate US postal codes")
                .validates_postal_code("zip_code", "US", 0.99)
                .build(),
        )
        .build();

    // Report the outcome instead of dropping it, as `validate_web_links` does.
    let results = suite.run(ctx).await?;
    results.print_summary();
    Ok(())
}

Validate JSON and datetime formats:
/// Runs the `structured_data_validation` suite and, only if it succeeds,
/// follows up with `validate_json_schema`.
///
/// The suite checks `request_body` / `response_body` as JSON and
/// `created_at` / `updated_at` as ISO 8601 datetimes.
///
/// # Errors
///
/// Propagates any error from running the suite or from the follow-up
/// schema validation.
async fn validate_structured_data(ctx: &SessionContext) -> Result<()> {
    let json_payloads = Check::builder("json_payloads")
        .description("Validate JSON request/response payloads")
        .validates_json("request_body", 1.0)
        .validates_json("response_body", 0.99) // Allow 1% errors
        .build();

    let iso_timestamps = Check::builder("iso_timestamps")
        .description("Validate ISO 8601 datetime formats")
        .validates_iso8601_datetime("created_at", 1.0)
        .validates_iso8601_datetime("updated_at", 1.0)
        .build();

    let suite = ValidationSuite::builder("structured_data_validation")
        .add_check(json_payloads)
        .add_check(iso_timestamps)
        .build();

    let outcome = suite.run(ctx).await?;

    // Advanced: the structural JSON check only runs once the format checks pass.
    if !outcome.is_success() {
        return Ok(());
    }
    validate_json_schema(ctx).await?;
    Ok(())
}
/// Runs the `json_schema_validation` suite and prints its summary.
///
/// The single check requires that `status`, `data`, and `timestamp` can all
/// be extracted from `response_body` (threshold 1.0).
///
/// # Errors
///
/// Propagates any error returned while running the suite.
async fn validate_json_schema(ctx: &SessionContext) -> Result<()> {
    // Validate specific JSON structure
    let suite = ValidationSuite::builder("json_schema_validation")
        .add_check(
            Check::builder("api_response_structure")
                .satisfies(
                    // The continuation lines stay at column 0 so the SQL text
                    // that reaches the engine is unchanged.
                    r#"
json_extract_path_text(response_body, '$.status') IS NOT NULL
AND json_extract_path_text(response_body, '$.data') IS NOT NULL
AND json_extract_path_text(response_body, '$.timestamp') IS NOT NULL
"#,
                    "API responses must have status, data, and timestamp fields",
                    1.0,
                )
                .build(),
        )
        .build();

    // `run` yields the suite's result object, not `()`, so it cannot be the
    // tail expression of a `Result<()>` function. Unwrap it, report it, and
    // return unit explicitly.
    let results = suite.run(ctx).await?;
    results.print_summary();
    Ok(())
}

Create custom validators for domain-specific formats:
/// Runs the `custom_format_validation` suite and prints its summary.
///
/// Three regex-based checks are registered: `product_code`, `employee_id`
/// (with explicit regex options), and `version`.
///
/// # Errors
///
/// Propagates any error returned while running the suite.
async fn validate_custom_formats(ctx: &SessionContext) -> Result<()> {
    let suite = ValidationSuite::builder("custom_format_validation")
        .add_check(
            Check::builder("product_codes")
                .description("Validate product code format")
                .validates_regex(
                    "product_code",
                    r"^[A-Z]{3}-\d{4}-[A-Z0-9]{2}$", // e.g., ABC-1234-X1
                    1.0,
                )
                .build(),
        )
        .add_check(
            Check::builder("employee_ids")
                .description("Validate employee ID format")
                .validates_regex_with_options(
                    "employee_id",
                    r"^(EMP|CTR|TMP)\d{6}$", // EMP123456, CTR123456, TMP123456
                    0.99,
                    RegexValidationOptions {
                        case_sensitive: true,
                        multiline: false,
                        dot_matches_newline: false,
                    },
                )
                .build(),
        )
        .add_check(
            Check::builder("version_numbers")
                .description("Validate semantic version numbers")
                .validates_regex(
                    "version",
                    r"^\d+\.\d+\.\d+(-[a-zA-Z0-9]+)?$", // 1.2.3 or 1.2.3-beta
                    0.95,
                )
                .build(),
        )
        .build();

    // Report the outcome instead of dropping it, as `validate_web_links` does.
    let results = suite.run(ctx).await?;
    results.print_summary();
    Ok(())
}

Create comprehensive validation suites:
/// Runs the `user_data_validation` suite and hands the result to
/// `generate_validation_report`.
///
/// One check covers contact fields (`email`, `phone`, `username`); the other
/// covers profile links (`website`, `linkedin_url`, `twitter_handle`).
///
/// # Errors
///
/// Propagates any error returned while running the suite.
async fn comprehensive_validation(ctx: &SessionContext) -> Result<()> {
    let contact_info = Check::builder("user_contact_info")
        .description("Validate all user contact information")
        .is_complete("email") // No nulls
        .validates_email("email", 1.0) // Valid emails
        .is_unique("email") // No duplicates
        .validates_phone("phone", 0.95) // Valid phone (95% threshold)
        .has_length_between("username", 3, 20) // Username length
        .validates_regex("username", r"^[a-zA-Z0-9_]+$", 1.0) // Alphanumeric
        .build();

    let profile_urls = Check::builder("user_profile_urls")
        .description("Validate user profile links")
        .validates_url("website", 0.90) // Personal websites
        .validates_url("linkedin_url", 0.95) // LinkedIn profiles
        .validates_regex("twitter_handle", r"^@[a-zA-Z0-9_]{1,15}$", 0.95)
        .build();

    let suite = ValidationSuite::builder("user_data_validation")
        .add_check(contact_info)
        .add_check(profile_urls)
        .build();

    let outcome = suite.run(ctx).await?;

    // Generate detailed report
    generate_validation_report(&outcome);
    Ok(())
}
/// Prints a boxed report banner followed by, for every check in `results`,
/// its name, its status (via `Debug`), and one compliance line per named
/// constraint.
///
/// Constraint results whose `constraint_name` is `None` are skipped.
fn generate_validation_report(results: &ValidationResult) {
    // Width of the banner between the two vertical border characters.
    const BANNER_INNER_WIDTH: usize = 38;
    const BANNER_TITLE: &str = "Format Validation Report";

    // Borders and title row are all derived from the same width so the
    // right-hand edge of the box lines up.
    let horizontal = "═".repeat(BANNER_INNER_WIDTH);
    println!("\n╔{}╗", horizontal);
    println!("║{:^width$}║", BANNER_TITLE, width = BANNER_INNER_WIDTH);
    println!("╚{}╝", horizontal);

    for check_result in results.check_results() {
        println!("\n► {}", check_result.check.name());
        println!(" Status: {:?}", check_result.status);

        for constraint_result in &check_result.constraint_results {
            if let Some(name) = &constraint_result.constraint_name {
                println!(
                    " - {}: {:.1}% compliant ({}/{})",
                    name,
                    constraint_result.compliance_rate * 100.0,
                    constraint_result.num_passed,
                    constraint_result.num_validated
                );
            }
        }
    }
}

Now let's build a complete data quality pipeline with format validation:
/// End-to-end example: profiles `users.email`, turns high-confidence
/// suggestions into checks, adds one hand-written business rule, runs the
/// resulting suite, and saves the outcome to an in-memory repository.
///
/// # Errors
///
/// Propagates any error from profiling, running the suite, or saving.
async fn data_quality_pipeline(ctx: &SessionContext) -> Result<()> {
    // Step 1: Profile the data
    let column_profiler = ColumnProfiler::builder().build();
    let email_profile = column_profiler
        .profile_column(ctx, "users", "email")
        .await?;

    // Step 2: Generate suggested validations
    let suggestions = SuggestionEngine::new()
        .add_rule(Box::new(FormatDetectionRule::new()))
        .suggest_constraints(&email_profile);

    // Step 3: Build validation suite from suggestions, keeping only those
    // with confidence above 0.8
    let suite_builder = suggestions
        .into_iter()
        .filter(|candidate| candidate.confidence > 0.8)
        .fold(
            ValidationSuite::builder("auto_generated_validations"),
            |acc, candidate| acc.add_suggested_check(candidate),
        );

    // Step 4: Add custom business rules
    let competitor_rule = Check::builder("business_rules")
        .satisfies(
            "email NOT LIKE '%@competitor.com'",
            "No competitor email addresses",
            1.0,
        )
        .build();

    // Step 5: Run validation
    let suite = suite_builder.add_check(competitor_rule).build();
    let outcome = suite.run(ctx).await?;

    // Step 6: Store results for monitoring
    // NOTE(review): `Utc` is assumed to be in scope (e.g. `chrono::Utc`) — confirm.
    let store = InMemoryRepository::new();
    store
        .save(
            ResultKey::new()
                .with_timestamp(Utc::now())
                .with_tag("pipeline", "format_validation"),
            outcome.to_analyzer_context(),
        )
        .await?;

    Ok(())
}

You've learned how to:
- ✅ Use built-in format validators for common data types
- ✅ Configure validation thresholds for partial compliance
- ✅ Create custom format validators with regex
- ✅ Combine multiple format validations
- ✅ Build comprehensive data quality pipelines
Next steps:
- Explore Format Validator Reference for all available validators
- Learn about Custom Constraint Development
- See Performance Optimization for large-scale validation
Q: Why are valid emails being marked as invalid?
A: Check for leading or trailing whitespace, and trim strings in your data pipeline before validating.
Q: How do I handle international formats?
A: Use the `_with_options` variants (for example, `validates_phone_with_options`) to specify country codes and formats.
Q: Can I validate multiple formats in one column?
A: Use `validates_regex` with a custom pattern that uses alternation: `(pattern1|pattern2|pattern3)`
Exercises:
- Create a validator for social security numbers (with appropriate masking)
- Build a validator for IBAN (International Bank Account Numbers)
- Implement a validator for scientific notation numbers
- Create a comprehensive validation suite for an e-commerce dataset