@@ -292,14 +292,97 @@ def test_execute_delete_previous_runs_rows(
292292 hive_stats_collection_operator = HiveStatsCollectionOperator (** self .kwargs )
293293 hive_stats_collection_operator .execute (context = {})
294294
295- sql = f """
295+ expected_sql = """
296296 DELETE FROM hive_stats
297297 WHERE
298- table_name=' { hive_stats_collection_operator . table } ' AND
299- partition_repr=' { mock_json_dumps . return_value } ' AND
300- dttm=' { hive_stats_collection_operator . dttm } ' ;
298+ table_name = %s AND
299+ partition_repr = %s AND
300+ dttm = %s ;
301301 """
302- mock_mysql_hook .return_value .run .assert_called_once_with (sql )
302+ mock_mysql_hook .return_value .run .assert_called_once_with (
303+ expected_sql ,
304+ parameters = (
305+ hive_stats_collection_operator .table ,
306+ mock_json_dumps .return_value ,
307+ hive_stats_collection_operator .dttm ,
308+ ),
309+ )
310+
@patch("airflow.providers.apache.hive.operators.hive_stats.MySqlHook")
@patch("airflow.providers.apache.hive.operators.hive_stats.PrestoHook")
@patch("airflow.providers.apache.hive.operators.hive_stats.HiveMetastoreHook")
def test_execute_rejects_invalid_table_identifier(
    self, mock_hive_metastore_hook, mock_presto_hook, mock_mysql_hook
):
    """A table name outside the ``<db>.<table>`` allowlist must be refused.

    The table identifier is interpolated into the Presto SELECT, so a value
    carrying whitespace or punctuation has to be rejected with an
    ``AirflowException`` rather than reaching the identifier position.
    """
    self.kwargs["table"] = "evil; DROP TABLE users--"

    # Build the operator inside the ``raises`` block: this test only pins
    # that the error surfaces somewhere between construction and execute().
    with pytest.raises(AirflowException, match="Invalid Hive table identifier"):
        operator = HiveStatsCollectionOperator(**self.kwargs)
        operator.execute(context={})
324+
@patch("airflow.providers.apache.hive.operators.hive_stats.MySqlHook")
@patch("airflow.providers.apache.hive.operators.hive_stats.PrestoHook")
@patch("airflow.providers.apache.hive.operators.hive_stats.HiveMetastoreHook")
def test_execute_rejects_invalid_partition_column(
    self, mock_hive_metastore_hook, mock_presto_hook, mock_mysql_hook
):
    """A partition key that is not a valid column identifier must be refused.

    Partition keys are placed into the Presto query as column identifiers
    (``<key> = %s`` predicates), so they are validated against an allowlist
    and a key containing a space has to raise ``AirflowException``.
    """
    self.kwargs["partition"] = {"evil col": "value"}

    # Build the operator inside the ``raises`` block: this test only pins
    # that the error surfaces somewhere between construction and execute().
    with pytest.raises(AirflowException, match="Invalid partition column name"):
        operator = HiveStatsCollectionOperator(**self.kwargs)
        operator.execute(context={})
336+
@patch("airflow.providers.apache.hive.operators.hive_stats.json.dumps")
@patch("airflow.providers.apache.hive.operators.hive_stats.MySqlHook")
@patch("airflow.providers.apache.hive.operators.hive_stats.PrestoHook")
@patch("airflow.providers.apache.hive.operators.hive_stats.HiveMetastoreHook")
def test_execute_parameterizes_mysql_bookkeeping_queries(
    self, mock_hive_metastore_hook, mock_presto_hook, mock_mysql_hook, mock_json_dumps
):
    """The hive_stats SELECT and DELETE must bind their values as parameters.

    Both bookkeeping statements are expected to carry ``%s`` placeholders and
    to receive ``(table, partition_repr, dttm)`` through ``parameters``
    instead of having those values interpolated into the SQL text, so the
    operator does not depend on the caller escaping them.
    """
    mysql = mock_mysql_hook.return_value
    metastore_table = mock_hive_metastore_hook.return_value.get_table.return_value
    metastore_table.sd.cols = [fake_col]
    # A truthy result for the existence check makes execute() issue the DELETE.
    mysql.get_records.return_value = True

    operator = HiveStatsCollectionOperator(**self.kwargs)
    operator.execute(context={})

    bound_values = (operator.table, mock_json_dumps.return_value, operator.dttm)

    # The SELECT goes through get_records() and the DELETE through run();
    # the same expectations apply to the most recent call of each.
    for recorded_call in (mysql.get_records.call_args, mysql.run.call_args):
        statement = recorded_call.args[0]
        assert "%s" in statement
        assert operator.table not in statement
        assert recorded_call.kwargs["parameters"] == bound_values
367+
@patch("airflow.providers.apache.hive.operators.hive_stats.MySqlHook")
@patch("airflow.providers.apache.hive.operators.hive_stats.PrestoHook")
@patch("airflow.providers.apache.hive.operators.hive_stats.HiveMetastoreHook")
def test_execute_parameterizes_presto_partition_values(
    self, mock_hive_metastore_hook, mock_presto_hook, mock_mysql_hook
):
    """Partition values must travel to Presto as bound parameters.

    The value of a partition entry may not appear in the Presto SQL text;
    the query is expected to contain a ``col = %s`` placeholder and the value
    itself is passed through ``parameters``.
    """
    self.kwargs["partition"] = {"col": "value"}

    metastore_table = mock_hive_metastore_hook.return_value.get_table.return_value
    metastore_table.sd.cols = [fake_col]
    mock_mysql_hook.return_value.get_records.return_value = False

    operator = HiveStatsCollectionOperator(**self.kwargs)
    operator.execute(context={})

    presto_call = mock_presto_hook.return_value.get_first.call_args
    presto_sql = presto_call.args[0]
    assert "col = %s" in presto_sql
    # The quoted literal would only be present if the value were interpolated.
    assert "'value'" not in presto_sql
    assert presto_call.kwargs["parameters"] == ("value",)
303386
304387 @pytest .mark .skipif (
305388 "AIRFLOW_RUNALL_TESTS" not in os .environ , reason = "Skipped because AIRFLOW_RUNALL_TESTS is not set"
@@ -326,23 +409,27 @@ def test_runs_for_hive_stats(self, mock_hive_metastore_hook):
326409 op .run (start_date = DEFAULT_DATE , end_date = DEFAULT_DATE , ignore_ti_state = True )
327410
328411 select_count_query = (
329- "SELECT COUNT(*) AS __count FROM airflow.static_babynames_partitioned WHERE ds = '2015-01-01' ;"
412+ "SELECT COUNT(*) AS __count FROM airflow.static_babynames_partitioned WHERE ds = %s ;"
330413 )
331- mock_presto_hook .get_first .assert_called_with (hql = select_count_query )
414+ presto_call = mock_presto_hook .get_first .call_args
415+ actual_presto_query = re .sub (r"\s{2,}" , " " , presto_call .args [0 ]).strip ()
416+ assert actual_presto_query == select_count_query
417+ assert presto_call .kwargs ["parameters" ] == ("2015-01-01" ,)
332418
333419 expected_stats_select_query = (
334- "SELECT 1 "
335- "FROM hive_stats "
336- "WHERE table_name='airflow.static_babynames_partitioned' "
337- ' AND partition_repr=\' {"ds": "2015-01-01"}\' '
338- " AND dttm='2015-01-01T00:00:00+00:00' "
339- "LIMIT 1;"
420+ "SELECT 1 FROM hive_stats WHERE table_name = %s AND partition_repr = %s AND dttm = %s LIMIT 1;"
340421 )
341422
342- raw_stats_select_query = mock_mysql_hook .get_records .call_args_list [0 ][0 ][0 ]
423+ stats_select_call = mock_mysql_hook .get_records .call_args_list [0 ]
424+ raw_stats_select_query = stats_select_call [0 ][0 ]
343425 actual_stats_select_query = re .sub (r"\s{2,}" , " " , raw_stats_select_query ).strip ()
344426
345427 assert expected_stats_select_query == actual_stats_select_query
428+ assert stats_select_call .kwargs ["parameters" ] == (
429+ "airflow.static_babynames_partitioned" ,
430+ '{"ds": "2015-01-01"}' ,
431+ "2015-01-01T00:00:00+00:00" ,
432+ )
346433
347434 insert_rows_val = [
348435 (
0 commit comments