diff --git a/src/comproscanner/comproscanner.py b/src/comproscanner/comproscanner.py index d780e1ed..a2a85c7c 100644 --- a/src/comproscanner/comproscanner.py +++ b/src/comproscanner/comproscanner.py @@ -54,11 +54,13 @@ class ComProScanner: def __init__(self, main_property_keyword: str = None): - self.main_property_keyword = main_property_keyword - if self.main_property_keyword is None: - raise ValueErrorHandler( - "Please provide a main property keyword to proceed." - ) + if main_property_keyword is None: + raise ValueErrorHandler( + "Please provide a main property keyword to proceed." + ) + + self.main_property_keyword = main_property_keyword.replace(" ", "_") + self.main_property_search_keyword = self.main_property_keyword.replace("_", " ") def collect_metadata( self, @@ -501,7 +503,7 @@ def extract_composition_property_data( f"results/extracted_data/{self.main_property_keyword}/related_figures" ) if materials_data_identifier_query is None: - materials_data_identifier_query = f"Is there any material chemical composition and corresponding {self.main_property_keyword} value mentioned in the paper? Give one word answer. Either yes or no." + materials_data_identifier_query = f"Is there any material chemical composition and corresponding {self.main_property_search_keyword} value mentioned in the paper? Give one word answer. Either yes or no." preparator = MatPropDataPreparator( main_property_keyword=self.main_property_keyword, main_extraction_keyword=main_extraction_keyword, diff --git a/src/comproscanner/metadata_extractor/fetch_metadata.py b/src/comproscanner/metadata_extractor/fetch_metadata.py index 6ca8f42b..2248ba44 100644 --- a/src/comproscanner/metadata_extractor/fetch_metadata.py +++ b/src/comproscanner/metadata_extractor/fetch_metadata.py @@ -115,8 +115,16 @@ def _construct_url(self, cursor, year, query, special_query): Returns: str: The constructed URL """ - base = f"{self.base_url}PUBYEAR+%3D+{year}+{query}" - url = base + (f"+{special_query}" if special_query else "") + "&count=200" + search_query = query.replace("_", " ") +encoded_query = urllib.parse.quote(search_query.replace(" ", " AND ")) +encoded_special_query = ( + urllib.parse.quote(special_query.replace("_", " ").replace(" ", " AND ")) + if special_query + else "" +) + + base = f"{self.base_url}PUBYEAR+%3D+{year}+{encoded_query}" + url = base + (f"+{encoded_special_query}" if encoded_special_query else "") + "&count=200" url += f"&cursor={cursor}" return url