---
version: "1"
rules:
  # ClickHouse sample dataset:
  # https://clickhouse.com/docs/getting-started/example-datasets/nyc-taxi
  - dataset: ch@[nyc_taxi.trips_small]
    # Row filter applied before any check below runs.
    where: "pickup_datetime > '2014-01-01'"
    checks:
      # schema-level checks
      - schema_check:
          expect_columns_ordered:
            columns_order: [trip_id, pickup_datetime, dropoff_datetime, pickup_longitude, pickup_latitude,
              dropoff_longitude, dropoff_latitude, passenger_count, trip_distance, fare_amount, extra,
              tip_amount, tolls_amount, total_amount, payment_type, pickup_ntaname, dropoff_ntaname]
          desc: "Ensure table columns are in the expected order"
          on_fail: error
      - schema_check:
          expect_columns:
            columns: [trip_id, fare_amount]
          desc: "Ensure required columns exist"
          on_fail: error
      - schema_check:
          columns_not_present:
            # Explicit deny-list plus a glob for any pii_-prefixed column.
            columns: [credit_card_number, credit_card_cvv]
            pattern: "pii_*"
          desc: "Ensure PII and credit card info is not present in the table"
          on_fail: error
      # table-level checks
      - row_count between 10000 and 3500000:
          desc: "Dataset should contain a reasonable number of trips"
          on_fail: error
      # column existence and nullability (no desc/on_fail -> tool defaults apply)
      - not_null(trip_id):
          desc: "Trip ID is mandatory"
      - not_null(pickup_datetime)
      - not_null(dropoff_datetime)
      # data freshness
      # NOTE(review): this sample dataset is historical (2014-2015 trips), so a
      # 7-day freshness window will warn on every run — confirm this is intended.
      - freshness(pickup_datetime) < 7d:
          desc: "Data should be no older than 7 days"
          on_fail: warn
      # uniqueness constraints
      - uniqueness(trip_id):
          desc: "Trip IDs must be unique"
          on_fail: error
      # numeric validations on trip distance
      - min(trip_distance) >= 0:
          desc: "Trip distance cannot be negative"
      - max(trip_distance) < 1000:
          desc: "Maximum trip distance seems unrealistic"
          on_fail: warn
      - avg(trip_distance) between 1.0 and 20.0:
          desc: "Average trip distance should be reasonable"
      - stddev(trip_distance) < 100:
          desc: "Trip distance variation should be within normal range"
      # fare validations
      - min(fare_amount) > 0:
          desc: "Fare amount should be positive"
      - max(fare_amount) < 1000:
          desc: "Maximum fare seems too high"
      - sum(fare_amount) between 10000 and 10000000:
          desc: "Total fare amount should be within expected range"
      # custom validation with raw query
      - raw_query:
          desc: "Check for trips with zero distance but positive fare"
          # {{dataset}} is presumably substituted with the dataset reference
          # declared above — verify against the tool's templating docs.
          query: "select count() from {{dataset}} where trip_distance = 0 and fare_amount > 0"
          on_fail: warn
  # UK land registry price-paid sample dataset:
  # https://wiki.postgresql.org/wiki/Sample_Databases
  - dataset: pg@[public.land_registry_price_paid_uk]
    # Row filter: only validate transfers from 2025 onward.
    where: "transfer_date >= '2025-01-01'"
    checks:
      # schema validation
      - schema_check:
          expect_columns_ordered:
            columns_order: [transaction, price, transfer_date, postcode, property_type, newly_built, duration, paon, saon,
              street, locality, city, district, county, ppd_category_type, record_status]
          desc: "Validate expected column order for data consistency"
          on_fail: warn
      - schema_check:
          expect_columns:
            columns: [transaction, price, property_type]
          desc: "Ensure critical columns exist"
          on_fail: error
      # NOTE(review): spelled row_count() here but row_count (no parens)
      # elsewhere in this file — confirm the tool accepts both spellings.
      - row_count() between 100 and 250000:
          desc: "Recent property transactions should be within expected volume"
      # price checks
      - not_null(price):
          desc: "Property price is mandatory"
      - min(price) >= 100:
          desc: "Minimum price should be realistic"
      - max(price) < 50000000:
          desc: "Maximum price should be within UK market range"
      - avg(price) between 200000 and 800000:
          desc: "Average property price should align with market data"
      - stddev(price) < 500000:
          desc: "Price standard deviation should indicate reasonable market variation"
      # property type validations
      - not_null(property_type)
      - uniqueness(transaction):
          desc: "Each transaction must have a unique identifier"
          on_fail: error
      # date validations
      - freshness(transfer_date) < 1d:
          desc: "Transfer date should be very recent"
          on_fail: warn
# # https://github.com/datacharmer/test_db
- dataset: mysql@[employees.salaries]
checks:
# schema validation
- schema_check:
expect_columns_ordered:
columns_order: [order_id, customer_id, order_status, total_amount, item_count, created_at, shipped_date]
desc: "Ensure order table maintains expected column structure"
on_fail: error
- schema_check:
expect_columns:
columns: [order_id, customer_id, order_status, total_amount]
desc: "Ensure essential order columns exist"
on_fail: error
# order volume validation
- row_count between 100 and 10000:
desc: "Monthly order volume should be within business expectations"
on_fail: warn
# customer data integrity
- not_null(customer_id):
desc: "Every order must have a customer ID"
on_fail: error
- not_null(order_status)
# order value validations
- min(total_amount) > 0:
desc: "Order total must be positive"
on_fail: error
- max(total_amount) < 10000:
desc: "Unusually high order amount detected"
on_fail: warn
- avg(total_amount) between 25.0 and 200.0:
desc: "Average order value should align with business metrics"