From d658d931fe6ee0a8916b148da975022ea037ce4f Mon Sep 17 00:00:00 2001 From: WolffRuoff Date: Wed, 11 Mar 2026 16:18:30 -0400 Subject: [PATCH 1/3] fix(fedex): add session warm-up and JSON API interception to bypass bot detection FedEx tracking was failing with TargetClosedError during wait_for_selector. The tracking page is fronted by a bot protection service that terminates the browser context before rendering when hit cold. Two-part fix: - Visit the FedEx homepage (domcontentloaded) before the tracking URL to establish a real-looking session with cookies/TLS fingerprint - Intercept the internal /trackingCal/track JSON API response fired during page navigation and parse it directly, skipping wait_for_selector entirely - HTML parsing retained as a fallback if the JSON response is not captured Co-Authored-By: Claude Sonnet 4.6 --- scraper/carriers/fedex.py | 89 ++++++++++++++++- scraper/tests/test_carriers_fedex.py | 144 ++++++++++++++++++++++++++- 2 files changed, 227 insertions(+), 6 deletions(-) diff --git a/scraper/carriers/fedex.py b/scraper/carriers/fedex.py index f7b7181..c81f37c 100644 --- a/scraper/carriers/fedex.py +++ b/scraper/carriers/fedex.py @@ -84,11 +84,47 @@ async def async_track( tracking_number=tracking_number, ) - url = self.tracking_url(tracking_number) - html = await self._get_page_content(browser, url, WAIT_SELECTOR) - self._parse_tracking_page(html, result) - result.last_updated = datetime.now() - return result + context = await browser.new_context() + page = await context.new_page() + + try: + _api_data: dict | None = None + + async def _on_response(response) -> None: + nonlocal _api_data + if _api_data is None and "trackingCal" in response.url: + try: + _api_data = await response.json() + except Exception: + pass + + page.on("response", _on_response) + + # Warm-up: visit homepage to establish session/cookies before + # hitting the tracking page (mitigates Akamai bot detection) + await page.goto( + "https://www.fedex.com/en-us/home.html", + wait_until="domcontentloaded", + timeout=30000, + ) + + url = self.tracking_url(tracking_number) + await page.goto(url, wait_until="networkidle", timeout=30000) + + if _api_data is not None: + # JSON path: API response captured during navigation — skip + # wait_for_selector entirely to avoid TargetClosedError + self._parse_tracking_json(_api_data, result) + else: + # HTML fallback: wait for DOM element then parse page content + await page.wait_for_selector(WAIT_SELECTOR, timeout=45000) + html = await page.content() + self._parse_tracking_page(html, result) + + result.last_updated = datetime.now() + return result + finally: + await context.close() def _parse_tracking_page(self, html: str, result: TrackingResult) -> None: """Parse the FedEx tracking page HTML.""" @@ -123,6 +159,49 @@ def _parse_tracking_page(self, html: str, result: TrackingResult) -> None: if event: result.events.append(event) + def _parse_tracking_json(self, data: dict, result: TrackingResult) -> None: + """Parse FedEx internal tracking JSON from the /trackingCal/track API.""" + try: + pkg = data["TrackPackagesResponse"]["packageList"][0] + except (KeyError, IndexError): + return + + key_status = pkg.get("keyStatus", "") + if key_status: + result.raw_status = key_status + result.status = self._map_status(key_status) + + eta_str = pkg.get("displayEstDeliveryDateTime", "") + if eta_str: + # Strip time component if present (e.g. "01/15/2025 00:00:00") + result.estimated_delivery = self._parse_date(eta_str.split(" ")[0]) + + for scan in pkg.get("scanEventList", []): + description = scan.get("eventDescription", "") + location = scan.get("scanLocation", "") + date_str = scan.get("date", "") + time_str = scan.get("time", "") + + timestamp = datetime.now() + if date_str: + combined = f"{date_str} {time_str}".strip() + try: + timestamp = datetime.strptime(combined, "%m/%d/%Y %I:%M %p") + except ValueError: + parsed = self._parse_date(date_str) + if parsed: + timestamp = parsed + + status = self._map_status(description) + result.events.append( + TrackingEvent( + timestamp=timestamp, + location=location, + description=description, + status=status, + ) + ) + def _map_status(self, raw_status: str) -> TrackingStatus: """Map a raw status string to TrackingStatus.""" lower = raw_status.lower() diff --git a/scraper/tests/test_carriers_fedex.py b/scraper/tests/test_carriers_fedex.py index e50cf0f..e6832fb 100644 --- a/scraper/tests/test_carriers_fedex.py +++ b/scraper/tests/test_carriers_fedex.py @@ -2,6 +2,8 @@ from __future__ import annotations +from unittest.mock import AsyncMock, MagicMock + import pytest from scraper.carriers.base import TrackingResult @@ -16,6 +18,53 @@ def provider(): VALID_TRACKING_12 = "123456789012" +FEDEX_API_JSON_DELIVERED = { + "TrackPackagesResponse": { + "packageList": [ + { + "keyStatus": "Delivered", + "displayEstDeliveryDateTime": "01/15/2025 00:00:00", + "scanEventList": [ + { + "date": "01/15/2025", + "time": "10:00 AM", + "eventDescription": "Delivered", + "scanLocation": "Springfield, IL 62701 US", + "status": "DL", + }, + { + "date": "01/15/2025", + "time": "06:00 AM", + "eventDescription": "On FedEx vehicle for delivery", + "scanLocation": "Springfield, IL 62701 US", + "status": "OD", + }, + ], + } + ] + } +} + +FEDEX_API_JSON_IN_TRANSIT = { + "TrackPackagesResponse": { + "packageList": [ + { + "keyStatus": "In transit", + "displayEstDeliveryDateTime": "01/16/2025 00:00:00", + "scanEventList": [ + { + "date": "01/14/2025", + "time": "02:30 PM", + "eventDescription": "Departed FedEx location", + "scanLocation": "Memphis, TN 38118 US", + "status": "DP", + } + ], + } + ] + } +} + class TestValidateTrackingNumber: def test_valid_12_digit(self, provider): @@ -81,16 +130,109 @@ def test_not_found_stays_unknown(self, provider, fedex_not_found_html): assert result.events == [] +class TestParseTrackingJson: + def test_delivered_status(self, provider): + result = TrackingResult(carrier=Carrier.FEDEX, tracking_number="TEST") + provider._parse_tracking_json(FEDEX_API_JSON_DELIVERED, result) + assert result.status == TrackingStatus.DELIVERED + assert result.raw_status == "Delivered" + + def test_delivered_estimated_delivery(self, provider): + result = TrackingResult(carrier=Carrier.FEDEX, tracking_number="TEST") + provider._parse_tracking_json(FEDEX_API_JSON_DELIVERED, result) + assert result.estimated_delivery is not None + assert result.estimated_delivery.month == 1 + assert result.estimated_delivery.day == 15 + assert result.estimated_delivery.year == 2025 + + def test_delivered_events(self, provider): + result = TrackingResult(carrier=Carrier.FEDEX, tracking_number="TEST") + provider._parse_tracking_json(FEDEX_API_JSON_DELIVERED, result) + assert len(result.events) == 2 + assert result.events[0].description == "Delivered" + assert result.events[0].location == "Springfield, IL 62701 US" + assert result.events[0].status == TrackingStatus.DELIVERED + + def test_in_transit_status(self, provider): + result = TrackingResult(carrier=Carrier.FEDEX, tracking_number="TEST") + provider._parse_tracking_json(FEDEX_API_JSON_IN_TRANSIT, result) + assert result.status == TrackingStatus.IN_TRANSIT + + def test_in_transit_events(self, provider): + result = TrackingResult(carrier=Carrier.FEDEX, tracking_number="TEST") + provider._parse_tracking_json(FEDEX_API_JSON_IN_TRANSIT, result) + assert len(result.events) == 1 + assert result.events[0].description == "Departed FedEx location" + + def test_empty_package_list(self, provider): + result = TrackingResult(carrier=Carrier.FEDEX, tracking_number="TEST") + data = {"TrackPackagesResponse": {"packageList": []}} + provider._parse_tracking_json(data, result) + assert result.status == TrackingStatus.UNKNOWN + assert result.events == [] + + def test_missing_key(self, provider): + result = TrackingResult(carrier=Carrier.FEDEX, tracking_number="TEST") + provider._parse_tracking_json({}, result) + assert result.status == TrackingStatus.UNKNOWN + + class TestAsyncTrack: @pytest.mark.asyncio - async def test_successful_tracking( + async def test_json_path_two_goto_calls(self, provider, mock_browser): + """When API response is intercepted, result is populated via JSON path.""" + browser, mock_page = mock_browser + + captured_callbacks: list = [] + mock_page.on = MagicMock( + side_effect=lambda event, cb: captured_callbacks.append(cb) + if event == "response" + else None + ) + + mock_response = AsyncMock() + mock_response.url = "https://www.fedex.com/trackingCal/track" + mock_response.json = AsyncMock(return_value=FEDEX_API_JSON_DELIVERED) + + call_count = 0 + + async def goto_side_effect(url, **kwargs): + nonlocal call_count + call_count += 1 + if call_count == 2 and captured_callbacks: + await captured_callbacks[0](mock_response) + + mock_page.goto.side_effect = goto_side_effect + + result = await provider.async_track(VALID_TRACKING_12, browser) + + assert mock_page.goto.call_count == 2 + # First call = homepage warm-up + first_url = mock_page.goto.call_args_list[0].args[0] + assert "home" in first_url + # Second call = tracking URL + second_url = mock_page.goto.call_args_list[1].args[0] + assert VALID_TRACKING_12 in second_url + + assert result.carrier == Carrier.FEDEX + assert result.status == TrackingStatus.DELIVERED + assert len(result.events) == 2 + assert result.last_updated is not None + # wait_for_selector should NOT be called in the JSON path + mock_page.wait_for_selector.assert_not_called() + + @pytest.mark.asyncio + async def test_html_fallback_when_no_api_response( self, provider, mock_browser, fedex_delivered_html ): + """When no API response is intercepted, falls back to HTML parsing.""" browser, mock_page = mock_browser mock_page.content.return_value = fedex_delivered_html result = await provider.async_track(VALID_TRACKING_12, browser) + assert mock_page.goto.call_count == 2 + mock_page.wait_for_selector.assert_called_once() assert result.carrier == Carrier.FEDEX assert result.status == TrackingStatus.DELIVERED assert len(result.events) == 3 From 59ebca061d130959476f143ab7844205addb2f68 Mon Sep 17 00:00:00 2001 From: WolffRuoff Date: Wed, 11 Mar 2026 17:41:02 -0400 Subject: [PATCH 2/3] fix(fedex): use asyncio.Event to correctly race API response vs timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes discovered during live testing: 1. Tracking API is at api.fedex.com/track/v2/shipments (not /trackingCal/track) — update URL filter to match "api.fedex.com/track/" instead of "trackingCal" 2. API response JSON uses output.packages[0] structure with mainStatus/ estDeliveryDt/scanEventList fields — rewrite _parse_tracking_json accordingly 3. Race condition: API response fires DURING wait_for_selector (not during goto), so using wait_for_selector or page.wait_for_response as a blocking call misses it. Fix: use asyncio.Event set by _on_response; asyncio.wait_for races the event against a 45s timeout, then parses JSON if captured. Co-Authored-By: Claude Sonnet 4.6 --- scraper/carriers/fedex.py | 75 ++++++++++++++++++---------- scraper/tests/test_carriers_fedex.py | 39 +++++++-------- 2 files changed, 66 insertions(+), 48 deletions(-) diff --git a/scraper/carriers/fedex.py b/scraper/carriers/fedex.py index c81f37c..962db0a 100644 --- a/scraper/carriers/fedex.py +++ b/scraper/carriers/fedex.py @@ -2,6 +2,7 @@ from __future__ import annotations +import asyncio import logging import re from datetime import datetime @@ -89,34 +90,49 @@ async def async_track( try: _api_data: dict | None = None + _api_event = asyncio.Event() async def _on_response(response) -> None: nonlocal _api_data - if _api_data is None and "trackingCal" in response.url: + url = response.url + if _api_data is None and ( + "trackingCal" in url + or "api.fedex.com/track/" in url + ): try: _api_data = await response.json() - except Exception: - pass + _LOGGER.info("FedEx: captured API response from %s", url) + _api_event.set() + except Exception as exc: + _LOGGER.warning("FedEx: failed to parse %s: %s", url, exc) page.on("response", _on_response) # Warm-up: visit homepage to establish session/cookies before - # hitting the tracking page (mitigates Akamai bot detection) + # hitting the tracking page (mitigates bot detection) await page.goto( "https://www.fedex.com/en-us/home.html", wait_until="domcontentloaded", timeout=30000, ) - url = self.tracking_url(tracking_number) - await page.goto(url, wait_until="networkidle", timeout=30000) + tracking_url = self.tracking_url(tracking_number) + # domcontentloaded so we don't block on networkidle before the + # tracking API fires; the asyncio.Event below handles the wait + await page.goto(tracking_url, wait_until="domcontentloaded", timeout=30000) + + # Wait for the tracking API response — may fire during or after goto + if not _api_event.is_set(): + try: + await asyncio.wait_for(_api_event.wait(), timeout=45.0) + except asyncio.TimeoutError: + _LOGGER.warning("FedEx: API response not captured for %s", tracking_number) if _api_data is not None: - # JSON path: API response captured during navigation — skip - # wait_for_selector entirely to avoid TargetClosedError + # JSON path: skip wait_for_selector entirely self._parse_tracking_json(_api_data, result) else: - # HTML fallback: wait for DOM element then parse page content + # HTML fallback await page.wait_for_selector(WAIT_SELECTOR, timeout=45000) html = await page.content() self._parse_tracking_page(html, result) @@ -160,37 +176,42 @@ def _parse_tracking_page(self, html: str, result: TrackingResult) -> None: result.events.append(event) def _parse_tracking_json(self, data: dict, result: TrackingResult) -> None: - """Parse FedEx internal tracking JSON from the /trackingCal/track API.""" + """Parse FedEx tracking JSON from the api.fedex.com/track/v2/shipments API.""" try: - pkg = data["TrackPackagesResponse"]["packageList"][0] + pkg = data["output"]["packages"][0] except (KeyError, IndexError): return - key_status = pkg.get("keyStatus", "") - if key_status: - result.raw_status = key_status - result.status = self._map_status(key_status) + # mainStatus is the human-readable status ("Picked up", "Delivered", etc.) + main_status = pkg.get("mainStatus", "") + if main_status: + result.raw_status = main_status + result.status = self._map_status(main_status) - eta_str = pkg.get("displayEstDeliveryDateTime", "") - if eta_str: - # Strip time component if present (e.g. "01/15/2025 00:00:00") - result.estimated_delivery = self._parse_date(eta_str.split(" ")[0]) + # estDeliveryDt is ISO-8601 — reliable for parsing; strip timezone for naive datetime + est_dt = pkg.get("estDeliveryDt", "") + if est_dt: + try: + result.estimated_delivery = datetime.fromisoformat(est_dt).replace(tzinfo=None) + except ValueError: + pass for scan in pkg.get("scanEventList", []): - description = scan.get("eventDescription", "") + description = scan.get("status", "") location = scan.get("scanLocation", "") - date_str = scan.get("date", "") - time_str = scan.get("time", "") + date_str = scan.get("date", "") # "YYYY-MM-DD" + time_str = scan.get("time", "") # "HH:MM:SS" timestamp = datetime.now() if date_str: - combined = f"{date_str} {time_str}".strip() + combined = f"{date_str}T{time_str}" if time_str else date_str try: - timestamp = datetime.strptime(combined, "%m/%d/%Y %I:%M %p") + timestamp = datetime.fromisoformat(combined) except ValueError: - parsed = self._parse_date(date_str) - if parsed: - timestamp = parsed + try: + timestamp = datetime.strptime(date_str, "%Y-%m-%d") + except ValueError: + pass status = self._map_status(description) result.events.append( diff --git a/scraper/tests/test_carriers_fedex.py b/scraper/tests/test_carriers_fedex.py index e6832fb..b7bf7ef 100644 --- a/scraper/tests/test_carriers_fedex.py +++ b/scraper/tests/test_carriers_fedex.py @@ -19,25 +19,23 @@ def provider(): VALID_TRACKING_12 = "123456789012" FEDEX_API_JSON_DELIVERED = { - "TrackPackagesResponse": { - "packageList": [ + "output": { + "packages": [ { - "keyStatus": "Delivered", - "displayEstDeliveryDateTime": "01/15/2025 00:00:00", + "mainStatus": "Delivered", + "estDeliveryDt": "2025-01-15T00:00:00+00:00", "scanEventList": [ { - "date": "01/15/2025", - "time": "10:00 AM", - "eventDescription": "Delivered", + "date": "2025-01-15", + "time": "10:00:00", + "status": "Delivered", "scanLocation": "Springfield, IL 62701 US", - "status": "DL", }, { - "date": "01/15/2025", - "time": "06:00 AM", - "eventDescription": "On FedEx vehicle for delivery", + "date": "2025-01-15", + "time": "06:00:00", + "status": "On FedEx vehicle for delivery", "scanLocation": "Springfield, IL 62701 US", - "status": "OD", }, ], } @@ -46,18 +44,17 @@ def provider(): } FEDEX_API_JSON_IN_TRANSIT = { - "TrackPackagesResponse": { - "packageList": [ + "output": { + "packages": [ { - "keyStatus": "In transit", - "displayEstDeliveryDateTime": "01/16/2025 00:00:00", + "mainStatus": "In transit", + "estDeliveryDt": "2025-01-16T00:00:00+00:00", "scanEventList": [ { - "date": "01/14/2025", - "time": "02:30 PM", - "eventDescription": "Departed FedEx location", + "date": "2025-01-14", + "time": "14:30:00", + "status": "Departed FedEx location", "scanLocation": "Memphis, TN 38118 US", - "status": "DP", } ], } @@ -166,7 +163,7 @@ def test_in_transit_events(self, provider): def test_empty_package_list(self, provider): result = TrackingResult(carrier=Carrier.FEDEX, tracking_number="TEST") - data = {"TrackPackagesResponse": {"packageList": []}} + data = {"output": {"packages": []}} provider._parse_tracking_json(data, result) assert result.status == TrackingStatus.UNKNOWN assert result.events == [] From bf39ed9edf9dfdc5be0adc62f0090ba93005d4cc Mon Sep 17 00:00:00 2001 From: WolffRuoff Date: Wed, 11 Mar 2026 17:47:39 -0400 Subject: [PATCH 3/3] chore(fedex): remove dead DESCRIPTION_MAPPING constant and its tests Co-Authored-By: Claude Sonnet 4.6 --- scraper/carriers/fedex.py | 11 ----------- scraper/tests/test_carriers_fedex.py | 7 +------ 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/scraper/carriers/fedex.py b/scraper/carriers/fedex.py index 962db0a..4f0596b 100644 --- a/scraper/carriers/fedex.py +++ b/scraper/carriers/fedex.py @@ -39,17 +39,6 @@ "clearance delay": TrackingStatus.EXCEPTION, } -DESCRIPTION_MAPPING: dict[str, TrackingStatus] = { - "Picked Up": TrackingStatus.PRE_TRANSIT, - "Shipment information sent to FedEx": TrackingStatus.PRE_TRANSIT, - "In transit": TrackingStatus.IN_TRANSIT, - "At local FedEx facility": TrackingStatus.IN_TRANSIT, - "On FedEx vehicle for delivery": TrackingStatus.OUT_FOR_DELIVERY, - "Out for Delivery": TrackingStatus.OUT_FOR_DELIVERY, - "Delivered": TrackingStatus.DELIVERED, - "Delivery exception": TrackingStatus.EXCEPTION, -} - class FedExProvider(CarrierProvider): """FedEx tracking provider via web scraping.""" diff --git a/scraper/tests/test_carriers_fedex.py b/scraper/tests/test_carriers_fedex.py index b7bf7ef..8261c31 100644 --- a/scraper/tests/test_carriers_fedex.py +++ b/scraper/tests/test_carriers_fedex.py @@ -7,7 +7,7 @@ import pytest from scraper.carriers.base import TrackingResult -from scraper.carriers.fedex import DESCRIPTION_MAPPING, STATUS_MAPPING, FedExProvider +from scraper.carriers.fedex import STATUS_MAPPING, FedExProvider from scraper.const import Carrier, TrackingStatus @@ -260,8 +260,3 @@ def test_pre_transit(self): def test_exception(self): assert STATUS_MAPPING["delivery exception"] == TrackingStatus.EXCEPTION - def test_description_delivered(self): - assert DESCRIPTION_MAPPING["Delivered"] == TrackingStatus.DELIVERED - - def test_description_in_transit(self): - assert DESCRIPTION_MAPPING["In transit"] == TrackingStatus.IN_TRANSIT