From e09955c48542a4b76250fcad294d1700a00a9f1b Mon Sep 17 00:00:00 2001 From: mithun Date: Sun, 28 Apr 2019 16:28:11 -0700 Subject: [PATCH 1/9] changed cookielib to cookiejar --- .idea/workspace.xml | 594 +++++++++++++++++--------------------------- pipInstalls.sh | 2 +- 2 files changed, 230 insertions(+), 366 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 0501cf2..410eea5 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -2,10 +2,10 @@ - + + \ No newline at end of file diff --git a/main/src/scraper.py b/main/src/scraper.py index b54032a..9bebf89 100755 --- a/main/src/scraper.py +++ b/main/src/scraper.py @@ -8,6 +8,7 @@ from email.MIMEText import MIMEText import mechanize import cookielib +import ssl #uncomment these 2 lines of code if you get the below error. Some unicode encoding stuff @@ -204,6 +205,15 @@ def parseGResults(myQS): br.addheaders = [('User-agent', 'Chrome')] + try: + _create_unverified_https_context = ssl._create_unverified_context + except AttributeError: + # Legacy Python that doesn't verify HTTPS certificates by default + pass + else: + # Handle target environment that doesn't support HTTPS verification + ssl._create_default_https_context = _create_unverified_https_context + # The site we will navigate into, handling it's session br.open(myQS) From a10f7df563692969fb97e5500292a009e1683f26 Mon Sep 17 00:00:00 2001 From: mithun Date: Wed, 8 May 2019 21:50:25 -0700 Subject: [PATCH 3/9] checked in 2019. code works fine as long as you follow the instructions --- main/src/scraper.py | 2 +- pipInstalls.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/main/src/scraper.py b/main/src/scraper.py index 9bebf89..966d1aa 100755 --- a/main/src/scraper.py +++ b/main/src/scraper.py @@ -21,7 +21,7 @@ queryStringForViewMatches='http://www.pof.com/viewmatches.aspx?agelow=23&agehigh=99&miles=10&contacted=2&cmdSearch=Refine+Matches' firstQueryString='http://www.pof.com/' numberOfGoogleResults=1000 -stubMessage='Hey, nice profile. Must say you have a very nice smile. Are you from Tucson originally?' +stubMessage='Hey, nice profile. You also have a very nice smile. Are you from Tucson originally?' startValue=1 stubUrlForPof='http://www.pof.com/' stubUrlForTucsonCLInnerpages='http://tucson.craigslist.org/' diff --git a/pipInstalls.sh b/pipInstalls.sh index b8fec39..77a3cc5 100755 --- a/pipInstalls.sh +++ b/pipInstalls.sh @@ -4,4 +4,5 @@ pip install PyPDF2 pip install html2text pip install mechanize pip install cookiejar +pip install lxml From b315e25f70c7866d584160579080d25267348980 Mon Sep 17 00:00:00 2001 From: mithun Date: Wed, 8 May 2019 22:08:45 -0700 Subject: [PATCH 4/9] tested. automatically sent messages to 40 people. Next todo: send to search results --- main/src/scraper.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/main/src/scraper.py b/main/src/scraper.py index 966d1aa..06bfd46 100755 --- a/main/src/scraper.py +++ b/main/src/scraper.py @@ -18,7 +18,8 @@ stubFilename='carIdHashTable.json' queryStringStubForTucson='http://tucson.craigslist.org/search/cto?' -queryStringForViewMatches='http://www.pof.com/viewmatches.aspx?agelow=23&agehigh=99&miles=10&contacted=2&cmdSearch=Refine+Matches' +queryStringForViewMatches='http://www.pof.com/viewmatches.aspx?agelow=25&agehigh=35&miles=10&contacted=2&cmdSearch=Refine+Matches' +queryStringForBasicSearchPage='https://www.pof.com/basicsearch.aspx' firstQueryString='http://www.pof.com/' numberOfGoogleResults=1000 stubMessage='Hey, nice profile. You also have a very nice smile. Are you from Tucson originally?' @@ -39,7 +40,7 @@ #turn this to true, if pushing to run on chung.cs.arizona.edu isRunningOnServer=False; firstTimeRun=False; - +useBasicSearchPage=False if(firstTimeRun): bodyOfEmail="Hi, \n Here is a list of all the cars found today in Craigslist. This is the very first email of craigslist scraping for used cars. Tomorrow onwards you will be shown only new hits that were not sent today. These are the parameters used for this query:\n\n" @@ -234,7 +235,11 @@ def parseGResults(myQS): try: #note:queryStringForViewMatches already contains the clause: havent contacted before. You dont want to spam #someone you have already contacted and then get blocked - url=br.open(queryStringForViewMatches) + if(useBasicSearchPage): + url=br.open(queryStringForBasicSearchPage) + else: + url = br.open(queryStringForViewMatches) + #url = urllib2.urlopen(queryStringToSearch) except urllib2.HTTPError, e: print('HTTPError = ' + str(e.code)) @@ -253,7 +258,9 @@ def parseGResults(myQS): # parse the content into a format that soup understands soup = bs4.BeautifulSoup(content, "lxml") # for each of the hyperlinks in the page + counter=0 for link in soup.find_all('a'): + #print(link) classResult = link.get('class') if (classResult != None): @@ -281,6 +288,7 @@ def parseGResults(myQS): # submit the text br.submit() + counter=counter+1 print("sent message to "+profilePageUrl) except urllib2.HTTPError, e: @@ -295,6 +303,7 @@ def parseGResults(myQS): #else: #profilePageDetails = profilePage.read() + print("done sending messages to "+str(counter) +"people") sys.exit(1) From c59e087ffed9aa251c1ce242f08d539d5d76e56a Mon Sep 17 00:00:00 2001 From: mithun Date: Wed, 8 May 2019 22:12:45 -0700 Subject: [PATCH 5/9] changed the input hello text --- main/src/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/src/scraper.py b/main/src/scraper.py index 06bfd46..1a72f7c 100755 --- a/main/src/scraper.py +++ b/main/src/scraper.py @@ -22,7 +22,7 @@ queryStringForBasicSearchPage='https://www.pof.com/basicsearch.aspx' firstQueryString='http://www.pof.com/' numberOfGoogleResults=1000 -stubMessage='Hey, nice profile. You also have a very nice smile. Are you from Tucson originally?' +stubMessage='Hey, nice profile. Love your smile. Are you from Tucson originally?' startValue=1 stubUrlForPof='http://www.pof.com/' stubUrlForTucsonCLInnerpages='http://tucson.craigslist.org/' From 9ffb0cce4e659884107e95695a7cc542827ed852 Mon Sep 17 00:00:00 2001 From: mithun Date: Sun, 12 May 2019 20:31:58 -0700 Subject: [PATCH 6/9] will send from basic search page --- main/src/scraper.py | 213 +++++++++++++++++++++++++++++--------------- 1 file changed, 142 insertions(+), 71 deletions(-) diff --git a/main/src/scraper.py b/main/src/scraper.py index 1a72f7c..960f369 100755 --- a/main/src/scraper.py +++ b/main/src/scraper.py @@ -25,6 +25,7 @@ stubMessage='Hey, nice profile. Love your smile. Are you from Tucson originally?' startValue=1 stubUrlForPof='http://www.pof.com/' +stubUrlForBasicSearchPage='http://www.pof.com/' stubUrlForTucsonCLInnerpages='http://tucson.craigslist.org/' stubUrlForPhxCLInnerpages='http://phoenix.craigslist.org/' username="" @@ -40,7 +41,7 @@ #turn this to true, if pushing to run on chung.cs.arizona.edu isRunningOnServer=False; firstTimeRun=False; -useBasicSearchPage=False +useBasicSearchPage=True if(firstTimeRun): bodyOfEmail="Hi, \n Here is a list of all the cars found today in Craigslist. This is the very first email of craigslist scraping for used cars. Tomorrow onwards you will be shown only new hits that were not sent today. These are the parameters used for this query:\n\n" @@ -179,6 +180,142 @@ def readFromJsonToHashtable(filename): carIdHashTable = {} return htMyTable +def send_from_basic_search_page(br,queryStringForViewMatches): + already_sent_today={'viewprofile.aspx?profile_id=82509149':1} + try: + url = br.open(queryStringForViewMatches) + except urllib2.HTTPError, e: + print('HTTPError = ' + str(e.code)) + except urllib2.URLError, e: + print('URLError = ' + str(e.reason)) + except httplib.HTTPException, e: + print('HTTPException') + except Exception: + import traceback + print('generic exception: ' + traceback.format_exc()) + else: + content = url.read() + + print("succesfully logged into pof") + # parse the content into a format that soup understands + soup = bs4.BeautifulSoup(content, "lxml") + # for each of the hyperlinks in the page + counter = 0 + for link in soup.find_all('a'): + # print(link) + classResult = link.get('class') + if (classResult != None): + # if the class exists, get the link, if its not null + linkToNextPage = link.get('href') + if (linkToNextPage != None): + #check if this hyperlink has a profile id + if("profile_id" in linkToNextPage): + #profile_id=74824023 is my own id + if not ("profile_id=74824023" in linkToNextPage): + profilePageUrl = stubUrlForBasicSearchPage + linkToNextPage + # print(profilePageUrl) + # once you get the link to the person'as profile, open and go into that page. + else: + continue + else: + continue + + try: + br.open(profilePageUrl) + # for f in br.forms(): + # print f + + # Select the first form (the first form is the quick message form) + br.select_form(nr=0) + + # User credentials + br.form['message'] = stubMessage + + # submit the text + if not(linkToNextPage in already_sent_today.keys()): + br.submit() + already_sent_today[linkToNextPage]=1 + else: + continue + + counter = counter + 1 + print("sent message to " + profilePageUrl) + + except Exception: + import traceback + + print('generic exception: ' + traceback.format_exc()) + # else: + # profilePageDetails = profilePage.read() + + print("done sending messages to " + str(counter) + "people") + sys.exit(1) + + +def send_from_view_matches_page(br,queryStringForViewMatches): + try: + url = br.open(queryStringForViewMatches) + except urllib2.HTTPError, e: + print('HTTPError = ' + str(e.code)) + except urllib2.URLError, e: + print('URLError = ' + str(e.reason)) + except httplib.HTTPException, e: + print('HTTPException') + except Exception: + import traceback + print('generic exception: ' + traceback.format_exc()) + else: + content = url.read() + + print("succesfully logged into pof") + # parse the content into a format that soup understands + soup = bs4.BeautifulSoup(content, "lxml") + # for each of the hyperlinks in the page + counter = 0 + for link in soup.find_all('a'): + # print(link) + classResult = link.get('class') + if (classResult != None): + if ("mi" in classResult): + # if the class exists, get the link, if its not null + linkToNextPage = link.get('href') + if (linkToNextPage != None): + print("\n") + profilePageUrl = stubUrlForPof + linkToNextPage + # print(profilePageUrl) + # once you get the link to the person'as profile, open and go into that page. + + try: + br.open(profilePageUrl) + # for f in br.forms(): + # print f + + # Select the first form (the first form is the quick message form) + br.select_form(nr=0) + + # User credentials + br.form['message'] = stubMessage + + # submit the text + # br.submit() + counter = counter + 1 + print("sent message to " + profilePageUrl) + + except urllib2.HTTPError, e: + print('HTTPError = ' + str(e.code)) + except urllib2.URLError, e: + print('URLError = ' + str(e.reason)) + except httplib.HTTPException, e: + print('HTTPException') + except Exception: + import traceback + + print('generic exception: ' + traceback.format_exc()) + # else: + # profilePageDetails = profilePage.read() + + print("done sending messages to " + str(counter) + "people") + sys.exit(1) def writeToFileAsJson(myhashTable, filename): # save to file: @@ -236,83 +373,17 @@ def parseGResults(myQS): #note:queryStringForViewMatches already contains the clause: havent contacted before. You dont want to spam #someone you have already contacted and then get blocked if(useBasicSearchPage): - url=br.open(queryStringForBasicSearchPage) + send_from_basic_search_page(br,queryStringForBasicSearchPage) else: - url = br.open(queryStringForViewMatches) - - #url = urllib2.urlopen(queryStringToSearch) - except urllib2.HTTPError, e: - print('HTTPError = ' + str(e.code)) - except urllib2.URLError, e: - print('URLError = ' + str(e.reason)) - except httplib.HTTPException, e: - print('HTTPException') - except Exception: + send_from_view_matches_page(br,queryStringForViewMatches) + except: import traceback print('generic exception: ' + traceback.format_exc()) - else: - content = url.read() - - - print("succesfully logged into pof") - # parse the content into a format that soup understands - soup = bs4.BeautifulSoup(content, "lxml") - # for each of the hyperlinks in the page - counter=0 - for link in soup.find_all('a'): - - #print(link) - classResult = link.get('class') - if (classResult != None): - if ("mi" in classResult): - # if the class exists, get the link, if its not null - linkToNextPage = link.get('href') - if (linkToNextPage != None): - print("\n") - profilePageUrl = stubUrlForPof + linkToNextPage - #print(profilePageUrl) - # once you get the link to the person'as profile, open and go into that page. - - - try: - br.open(profilePageUrl) - #for f in br.forms(): - #print f - - # Select the first form (the first form is the quick message form) - br.select_form(nr=0) - - # User credentials - br.form['message'] = stubMessage - - - # submit the text - br.submit() - counter=counter+1 - print("sent message to "+profilePageUrl) - - except urllib2.HTTPError, e: - print('HTTPError = ' + str(e.code)) - except urllib2.URLError, e: - print('URLError = ' + str(e.reason)) - except httplib.HTTPException, e: - print('HTTPException') - except Exception: - import traceback - print('generic exception: ' + traceback.format_exc()) - #else: - #profilePageDetails = profilePage.read() - - print("done sending messages to "+str(counter) +"people") - sys.exit(1) - - except: - #print('generic exception: ') import traceback print('generic exception: ' + traceback.format_exc()) - #+sys.exc_info()[0]) + From d2908c17160301779219baaedf8bf81a5c4afb68 Mon Sep 17 00:00:00 2001 From: mithunpaul08 Date: Sat, 11 Jun 2022 10:48:53 -0700 Subject: [PATCH 7/9] rewriting for python 3 --- main/src/{scraper.py => cl_alert.py} | 0 main/src/okcscraper.py | 14 ++ main/src/pof_scraper.py | 303 +++++++++++++++++++++++++++ 3 files changed, 317 insertions(+) rename main/src/{scraper.py => cl_alert.py} (100%) create mode 100644 main/src/okcscraper.py create mode 100755 main/src/pof_scraper.py diff --git a/main/src/scraper.py b/main/src/cl_alert.py similarity index 100% rename from main/src/scraper.py rename to main/src/cl_alert.py diff --git a/main/src/okcscraper.py b/main/src/okcscraper.py new file mode 100644 index 0000000..ed72d53 --- /dev/null +++ b/main/src/okcscraper.py @@ -0,0 +1,14 @@ +import requests + +url="https://www.okcupid.com/login" + +payload={ + "username":"mpaul588", + "password":"Alohomora1" +} + +session=requests.session() +response=requests.get(url).content +print(response) + + diff --git a/main/src/pof_scraper.py b/main/src/pof_scraper.py new file mode 100755 index 0000000..6d131a4 --- /dev/null +++ b/main/src/pof_scraper.py @@ -0,0 +1,303 @@ +# #this is no the live version. It contains dummy password. This is for testing only. this is connected to cron job + +#todo +#add ssh + +import requests, bs4, sys, webbrowser, html2text, os , PyPDF2, urllib2, smtplib, re, json +from email.MIMEMultipart import MIMEMultipart +from email.MIMEText import MIMEText +import mechanize +import cookielib +import ssl + + +#uncomment these 2 lines of code if you get the below error. Some unicode encoding stuff +#UnicodeEncodeError: 'ascii' codec can't encode character u'\ufeff' in position 0: ordinal not in range(128) +reload(sys) +sys.setdefaultencoding('utf8') + +stubFilename='carIdHashTable.json' +queryStringForViewMatches='http://www.pof.com/viewmatches.aspx?agelow=25&agehigh=35&miles=10&contacted=2&cmdSearch=Refine+Matches' +queryStringForBasicSearchPage='https://www.pof.com/basicsearch.aspx' +firstQueryString='http://www.okcupid.com/' +numberOfGoogleResults=1000 +stubMessage='Hey, nice profile. Love your smile. Are you from Tucson originally?' +startValue=1 +stubUrlForPof='http://www.pof.com/' +stubUrlForBasicSearchPage='http://www.pof.com/' +#stubUrlForTucsonCLInnerpages='http://tucson.craigslist.org/' +#stubUrlForPhxCLInnerpages='http://phoenix.craigslist.org/' +username="" +#the values can be manual, automatic, or both +transmission="both" +pwd="" + + + +path = "/home/mithunpaul/allResearch/clscraper/main/src/" +#pathonLaptop +#path = "/home/mithunpaul/allResearch/clscraper/main/src/" + + + + +if(len(sys.argv)>1): + username=sys.argv[1] + pwd = sys.argv[2] + #print("username:"+username) + # print("pwd:" + pwd) + +else: + print("not enough arguments in Command Line. Exiting.") + sys.exit(1) + + +def encodeAndwriteToOutputFile(textToWrite): + target = open(stubFilename+'.txt', 'w+') + target.write(html2text.html2text(textToWrite).encode('utf-8')) + target.close() + + +def writeToOutputFile(textToWrite): + target = open(stubFilename+'.txt', 'w+') + target.write(textToWrite); + target.close() + +def AdduidToHashtable(uniqueId, localhtToCheck): + localhtToCheck[uniqueId] = 1 +# print("length of hashtable inside checkAndadduidToHashtable is:"+`localhtToCheck.__len__()`) + return localhtToCheck + +def readFromJsonToHashtable(filename): + # load from file: + htMyTable={} + with open(filename, 'r') as f: + try: + #print("inside child :length of hashtable that just came in is:"+`carIdHashTable.__len__()`) + #carIdHashTable["test"] = 1 + # print("inside child :length of hashtable that just came in is:"+`carIdHashTable.__len__()`) + htMyTable = json.load(f) + #print("inside child :length of hashtable inside is:"+`htMyTable.__len__()`) + #carIdHashTable=htMyTable + # print("inside child :length of carIdHashTable inside is:"+`carIdHashTable.__len__()`) + # if the file is empty the ValueError will be thrown + except: + carIdHashTable = {} + return htMyTable + +def send_from_basic_search_page(br,queryStringForViewMatches): + already_sent_today={'viewprofile.aspx?profile_id=82509149':1} + try: + url = br.open(queryStringForViewMatches) + except urllib2.HTTPError as e: + print('HTTPError = ' + str(e.code)) + except urllib2.URLError as e: + print('URLError = ' + str(e.reason)) + except httplib.HTTPException as e: + print('HTTPException') + except Exception: + import traceback + print('generic exception: ' + traceback.format_exc()) + else: + content = url.read() + + print("succesfully logged into pof") + # parse the content into a format that soup understands + soup = bs4.BeautifulSoup(content, "lxml") + # for each of the hyperlinks in the page + counter = 0 + for link in soup.find_all('a'): + # print(link) + classResult = link.get('class') + if (classResult != None): + # if the class exists, get the link, if its not null + linkToNextPage = link.get('href') + if (linkToNextPage != None): + #check if this hyperlink has a profile id + if("profile_id" in linkToNextPage): + #profile_id=74824023 is my own id + if not ("profile_id=74824023" in linkToNextPage): + profilePageUrl = stubUrlForBasicSearchPage + linkToNextPage + # print(profilePageUrl) + # once you get the link to the person'as profile, open and go into that page. + else: + continue + else: + continue + + try: + br.open(profilePageUrl) + # for f in br.forms(): + # print f + + # Select the first form (the first form is the quick message form) + br.select_form(nr=0) + + # User credentials + br.form['message'] = stubMessage + + # submit the text + if not(linkToNextPage in already_sent_today.keys()): + br.submit() + already_sent_today[linkToNextPage]=1 + else: + continue + + counter = counter + 1 + print("sent message to " + profilePageUrl) + + except Exception: + import traceback + + print('generic exception: ' + traceback.format_exc()) + # else: + # profilePageDetails = profilePage.read() + + print("done sending messages to " + str(counter) + "people") + sys.exit(1) + + +def send_from_view_matches_page(br,queryStringForViewMatches): + try: + url = br.open(queryStringForViewMatches) + except urllib2.HTTPError as e: + print('HTTPError = ' + str(e.code)) + except urllib2.URLError as e: + print('URLError = ' + str(e.reason)) + except httplib.HTTPException as e: + print('HTTPException') + except Exception: + import traceback + print('generic exception: ' + traceback.format_exc()) + else: + content = url.read() + + print("succesfully logged into pof") + # parse the content into a format that soup understands + soup = bs4.BeautifulSoup(content, "lxml") + # for each of the hyperlinks in the page + counter = 0 + for link in soup.find_all('a'): + # print(link) + classResult = link.get('class') + if (classResult != None): + if ("mi" in classResult): + # if the class exists, get the link, if its not null + linkToNextPage = link.get('href') + if (linkToNextPage != None): + print("\n") + profilePageUrl = stubUrlForPof + linkToNextPage + # print(profilePageUrl) + # once you get the link to the person'as profile, open and go into that page. + + try: + br.open(profilePageUrl) + # for f in br.forms(): + # print f + + # Select the first form (the first form is the quick message form) + br.select_form(nr=0) + + # User credentials + br.form['message'] = stubMessage + + # submit the text + # br.submit() + counter = counter + 1 + print("sent message to " + profilePageUrl) + + except urllib2.HTTPError as e: + print('HTTPError = ' + str(e.code)) + except urllib2.URLError as e: + print('URLError = ' + str(e.reason)) + except httplib.HTTPException as e: + print('HTTPException') + except Exception: + import traceback + + print('generic exception: ' + traceback.format_exc()) + # else: + # profilePageDetails = profilePage.read() + + print("done sending messages to " + str(counter) + "people") + sys.exit(1) + +def writeToFileAsJson(myhashTable, filename): + # save to file: + with open(filename, 'w+') as f: + json.dump(myhashTable, f) + f.close() + +def parseGResults(myQS): + try: + #code from http://stackoverflow.com/questions/20039643/how-to-scrape-a-website-that-requires-login-first-with-python + # Browser + br = mechanize.Browser() + + # Cookie Jar + cj = cookielib.LWPCookieJar() + br.set_cookiejar(cj) + + # Browser options + br.set_handle_equiv(True) + br.set_handle_gzip(True) + br.set_handle_redirect(True) + br.set_handle_referer(True) + br.set_handle_robots(False) + br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1) + + br.addheaders = [('User-agent', 'Chrome')] + + try: + _create_unverified_https_context = ssl._create_unverified_context + except AttributeError: + # Legacy Python that doesn't verify HTTPS certificates by default + pass + else: + # Handle target environment that doesn't support HTTPS verification + ssl._create_default_https_context = _create_unverified_https_context + + # The site we will navigate into, handling it's session + br.open(myQS) + + # View available forms + # for f in br.forms(): + # print f + + # Select the second (index one) form (the first form is a search query box) + br.select_form(nr=0) + + # User credentials + br.form['username'] = username + br.form['password'] = pwd + + # Login + br.submit() + + try: + #note:queryStringForViewMatches already contains the clause: havent contacted before. You dont want to spam + #someone you have already contacted and then get blocked + if(useBasicSearchPage): + send_from_basic_search_page(br,queryStringForBasicSearchPage) + else: + send_from_view_matches_page(br,queryStringForViewMatches) + except: + import traceback + print('generic exception: ' + traceback.format_exc()) + + except: + import traceback + print('generic exception: ' + traceback.format_exc()) + + + + +cwd = os.getcwd() +print("current directory is:"+cwd) +# Now change the directory +if(isRunningOnServer): + os.chdir( path ) + + +parseGResults(firstQueryString) + From e269a20e67d06629578e073cbfa784f90b435eb8 Mon Sep 17 00:00:00 2001 From: mithunpaul08 Date: Sun, 12 Jun 2022 13:29:20 -0700 Subject: [PATCH 8/9] managed to reach till getting login link from home page of pof --- main/src/pof_scraper.py | 42 ++++++++++++++++++++++------------------- pipInstalls.sh | 2 +- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/main/src/pof_scraper.py b/main/src/pof_scraper.py index 6d131a4..bc74dc1 100755 --- a/main/src/pof_scraper.py +++ b/main/src/pof_scraper.py @@ -3,23 +3,21 @@ #todo #add ssh -import requests, bs4, sys, webbrowser, html2text, os , PyPDF2, urllib2, smtplib, re, json -from email.MIMEMultipart import MIMEMultipart -from email.MIMEText import MIMEText +import bs4, sys, html2text, os , json import mechanize -import cookielib import ssl +from http.cookiejar import CookieJar #uncomment these 2 lines of code if you get the below error. Some unicode encoding stuff -#UnicodeEncodeError: 'ascii' codec can't encode character u'\ufeff' in position 0: ordinal not in range(128) -reload(sys) -sys.setdefaultencoding('utf8') +#UnicodeEncodeError: 'ascii' codec can't encode character u'|ufeff' in position 0: ordinal not in range(128) +#reload(sys) +#sys.setdefaultencoding('utf8') stubFilename='carIdHashTable.json' queryStringForViewMatches='http://www.pof.com/viewmatches.aspx?agelow=25&agehigh=35&miles=10&contacted=2&cmdSearch=Refine+Matches' queryStringForBasicSearchPage='https://www.pof.com/basicsearch.aspx' -firstQueryString='http://www.okcupid.com/' +firstQueryString='https://www.pof.com/' numberOfGoogleResults=1000 stubMessage='Hey, nice profile. Love your smile. Are you from Tucson originally?' startValue=1 @@ -179,9 +177,9 @@ def send_from_view_matches_page(br,queryStringForViewMatches): counter = 0 for link in soup.find_all('a'): # print(link) - classResult = link.get('class') - if (classResult != None): - if ("mi" in classResult): + classResult = link.text.lower() + if (classResult != ""): + if ("sign" in classResult): # if the class exists, get the link, if its not null linkToNextPage = link.get('href') if (linkToNextPage != None): @@ -206,7 +204,7 @@ def send_from_view_matches_page(br,queryStringForViewMatches): counter = counter + 1 print("sent message to " + profilePageUrl) - except urllib2.HTTPError as e: + except urllib.HTTPError as e: print('HTTPError = ' + str(e.code)) except urllib2.URLError as e: print('URLError = ' + str(e.reason)) @@ -235,7 +233,7 @@ def parseGResults(myQS): br = mechanize.Browser() # Cookie Jar - cj = cookielib.LWPCookieJar() + cj = CookieJar() br.set_cookiejar(cj) # Browser options @@ -258,11 +256,14 @@ def parseGResults(myQS): ssl._create_default_https_context = _create_unverified_https_context # The site we will navigate into, handling it's session - br.open(myQS) + response=br.open(myQS) + print(response.read()) + + send_from_view_matches_page(br, firstQueryString) # View available forms - # for f in br.forms(): - # print f + for f in br.forms(): + print(f) # Select the second (index one) form (the first form is a search query box) br.select_form(nr=0) @@ -295,9 +296,12 @@ def parseGResults(myQS): cwd = os.getcwd() print("current directory is:"+cwd) # Now change the directory -if(isRunningOnServer): - os.chdir( path ) - +# if(isRunningOnServer): +# os.chdir( path ) +# import requests +# #print(requests.get("https://www.pof.com/").content) +# import sys +# sys.exit() parseGResults(firstQueryString) diff --git a/pipInstalls.sh b/pipInstalls.sh index 77a3cc5..f4d6da3 100755 --- a/pipInstalls.sh +++ b/pipInstalls.sh @@ -5,4 +5,4 @@ pip install html2text pip install mechanize pip install cookiejar pip install lxml - +pip install urllib3 From e53f68dd347aeef5b98ce6831ce59d801e9375cb Mon Sep 17 00:00:00 2001 From: mithunpaul08 Date: Sun, 12 Jun 2022 16:27:30 -0700 Subject: [PATCH 9/9] still not able to find the form from the/login page. might have as well gone just directly to pof.com/login. anyway todo: why is my chrome inspector sayinge err_blocked_by_client. if there is no way to inspect the actual form name, i think it is time ot switch to burp suite --- main/src/pof_scraper.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/main/src/pof_scraper.py b/main/src/pof_scraper.py index bc74dc1..6fff8eb 100755 --- a/main/src/pof_scraper.py +++ b/main/src/pof_scraper.py @@ -3,7 +3,7 @@ #todo #add ssh -import bs4, sys, html2text, os , json +import bs4, sys, html2text, os , json, urllib import mechanize import ssl from http.cookiejar import CookieJar @@ -155,7 +155,10 @@ def send_from_basic_search_page(br,queryStringForViewMatches): sys.exit(1) -def send_from_view_matches_page(br,queryStringForViewMatches): +#from the landing page e.g. www.pof.com find the link to login page. Have to do it this +#way because many sites have no form in landpage\login +# #todo: make this a general function that will work for other sites also +def find_login_page(br, queryStringForViewMatches): try: url = br.open(queryStringForViewMatches) except urllib2.HTTPError as e: @@ -184,14 +187,14 @@ def send_from_view_matches_page(br,queryStringForViewMatches): linkToNextPage = link.get('href') if (linkToNextPage != None): print("\n") - profilePageUrl = stubUrlForPof + linkToNextPage + loginPageUrl = stubUrlForPof + linkToNextPage # print(profilePageUrl) # once you get the link to the person'as profile, open and go into that page. try: - br.open(profilePageUrl) - # for f in br.forms(): - # print f + login_page_content = bs4.BeautifulSoup(br.open(loginPageUrl), "lxml") + for form in login_page_content.find_all('form'): + print(form) # Select the first form (the first form is the quick message form) br.select_form(nr=0) @@ -204,13 +207,14 @@ def send_from_view_matches_page(br,queryStringForViewMatches): counter = counter + 1 print("sent message to " + profilePageUrl) - except urllib.HTTPError as e: - print('HTTPError = ' + str(e.code)) - except urllib2.URLError as e: - print('URLError = ' + str(e.reason)) - except httplib.HTTPException as e: - print('HTTPException') - except Exception: + # except urllib.HTTPError as e: + # print('HTTPError = ' + str(e.code)) + # except urllib.URLError as e: + # print('URLError = ' + str(e.reason)) + # except httplib.HTTPException as e: + # print('HTTPException') + except Exception as e: + print(e) import traceback print('generic exception: ' + traceback.format_exc()) @@ -259,7 +263,7 @@ def parseGResults(myQS): response=br.open(myQS) print(response.read()) - send_from_view_matches_page(br, firstQueryString) + find_login_page(br, firstQueryString) # View available forms for f in br.forms(): @@ -281,7 +285,7 @@ def parseGResults(myQS): if(useBasicSearchPage): send_from_basic_search_page(br,queryStringForBasicSearchPage) else: - send_from_view_matches_page(br,queryStringForViewMatches) + find_login_page(br, queryStringForViewMatches) except: import traceback print('generic exception: ' + traceback.format_exc())