From e9fddc930d67e6784459ad40605c52623d673b9a Mon Sep 17 00:00:00 2001 From: WilmerGaspar Date: Wed, 17 Jun 2026 21:05:28 -0600 Subject: [PATCH 1/2] fix(metadata): encode multi-word Scopus queries Encode main and special Scopus query terms after converting spaces to AND for multi-word keyword handling. --- .../metadata_extractor/fetch_metadata.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/comproscanner/metadata_extractor/fetch_metadata.py b/src/comproscanner/metadata_extractor/fetch_metadata.py index 6ca8f42b..a4f50ca3 100644 --- a/src/comproscanner/metadata_extractor/fetch_metadata.py +++ b/src/comproscanner/metadata_extractor/fetch_metadata.py @@ -115,8 +115,15 @@ def _construct_url(self, cursor, year, query, special_query): Returns: str: The constructed URL """ - base = f"{self.base_url}PUBYEAR+%3D+{year}+{query}" - url = base + (f"+{special_query}" if special_query else "") + "&count=200" + encoded_query = urllib.parse.quote(query.replace(" ", " AND ")) + encoded_special_query = ( + urllib.parse.quote(special_query.replace(" ", " AND ")) + if special_query + else "" + ) + + base = f"{self.base_url}PUBYEAR+%3D+{year}+{encoded_query}" + url = base + (f"+{encoded_special_query}" if encoded_special_query else "") + "&count=200" url += f"&cursor={cursor}" return url From dd605f7a02f306e553b8073bfd007a1c46d0bbe3 Mon Sep 17 00:00:00 2001 From: WilmerGaspar Date: Thu, 18 Jun 2026 22:21:43 -0600 Subject: [PATCH 2/2] fix(metadata): normalize multi-word keyword handling --- src/comproscanner/comproscanner.py | 14 ++++++++------ .../metadata_extractor/fetch_metadata.py | 13 +++++++------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/comproscanner/comproscanner.py b/src/comproscanner/comproscanner.py index d780e1ed..a2a85c7c 100644 --- a/src/comproscanner/comproscanner.py +++ b/src/comproscanner/comproscanner.py @@ -54,11 +54,13 @@ class ComProScanner: def __init__(self, main_property_keyword: str = None): - 
self.main_property_keyword = main_property_keyword - if self.main_property_keyword is None: - raise ValueErrorHandler( - "Please provide a main property keyword to proceed." - ) + if main_property_keyword is None: + raise ValueErrorHandler( + "Please provide a main property keyword to proceed." + ) + + self.main_property_keyword = main_property_keyword.replace(" ", "_") + self.main_property_search_keyword = self.main_property_keyword.replace("_", " ") def collect_metadata( self, @@ -501,7 +503,7 @@ def extract_composition_property_data( f"results/extracted_data/{self.main_property_keyword}/related_figures" ) if materials_data_identifier_query is None: - materials_data_identifier_query = f"Is there any material chemical composition and corresponding {self.main_property_keyword} value mentioned in the paper? Give one word answer. Either yes or no." + materials_data_identifier_query = f"Is there any material chemical composition and corresponding {self.main_property_search_keyword} value mentioned in the paper? Give one word answer. Either yes or no." 
preparator = MatPropDataPreparator( main_property_keyword=self.main_property_keyword, main_extraction_keyword=main_extraction_keyword, diff --git a/src/comproscanner/metadata_extractor/fetch_metadata.py b/src/comproscanner/metadata_extractor/fetch_metadata.py index a4f50ca3..2248ba44 100644 --- a/src/comproscanner/metadata_extractor/fetch_metadata.py +++ b/src/comproscanner/metadata_extractor/fetch_metadata.py @@ -115,12 +115,13 @@ def _construct_url(self, cursor, year, query, special_query): Returns: str: The constructed URL """ - encoded_query = urllib.parse.quote(query.replace(" ", " AND ")) - encoded_special_query = ( - urllib.parse.quote(special_query.replace(" ", " AND ")) - if special_query - else "" - ) + search_query = query.replace("_", " ") + encoded_query = urllib.parse.quote(search_query.replace(" ", " AND ")) + encoded_special_query = ( + urllib.parse.quote(special_query.replace("_", " ").replace(" ", " AND ")) + if special_query + else "" + ) base = f"{self.base_url}PUBYEAR+%3D+{year}+{encoded_query}" url = base + (f"+{encoded_special_query}" if encoded_special_query else "") + "&count=200"