-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawler.py
More file actions
117 lines (100 loc) · 3.95 KB
/
crawler.py
File metadata and controls
117 lines (100 loc) · 3.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# -*- coding: utf-8 -*-
from utils import *
def clean_text(str_in):
    """Lower-case *str_in* and collapse each run of non-letters to one space.

    Every character outside ``A-Z``/``a-z`` (digits, punctuation, whitespace,
    non-ASCII characters) is treated as a separator. A leading or trailing
    run of separators is kept as a single space; nothing is stripped.
    """
    import re

    non_letter_run = re.compile("[^A-Za-z]+")
    lowered = str_in.lower()
    return non_letter_run.sub(" ", lowered)
def init():
    """Open a 3-hop Tor circuit and send one plain HTTP request through it.

    Uses torpy (https://pypi.org/project/torpy/, ``pip install torpy``) to
    open a stream to ``ifconfig.me`` on port 80, sends a minimal HTTP/1.0
    ``GET /`` and reads up to 1024 bytes of the reply. The reply is not used,
    and the stream, circuit and client are all closed before returning.

    NOTE(review): nothing in this function configures later ``requests``
    calls to go through Tor -- confirm what callers expect from it.

    Returns:
        int: always 0.
    """
    from torpy import TorClient

    # An onion hostname can be used here as well.
    target_host = 'ifconfig.me'
    http_request = b'GET / HTTP/1.0\r\nHost: %s\r\n\r\n' % target_host.encode()

    # Single ``with``: entered left to right (client, 3-hop circuit, stream
    # to the host) and released in reverse order on exit.
    with TorClient() as tor_client, \
            tor_client.create_circuit(3) as tor_circuit, \
            tor_circuit.create_stream((target_host, 80)) as tor_stream:
        tor_stream.send(http_request)
        tor_stream.recv(1024)  # reply is read and then dropped
    return 0
def my_scraper(tmp_url_in):
    """Download a page and return the text of its ``<p>`` elements.

    The paragraph texts are joined with single spaces, then every run of
    non-word characters is collapsed to one space.

    Args:
        tmp_url_in: URL of the page to fetch (10 second timeout).

    Returns:
        str: the normalized paragraph text, or ``''`` if fetching or parsing
        raised. The result is always a string, even on a mid-way failure.
    """
    from bs4 import BeautifulSoup
    import requests
    import re
    import time

    page_text = ''
    try:
        response = requests.get(tmp_url_in, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = [tag.text for tag in soup.find_all('p')]
        # r'\W+' also matches the non-breaking space '\xa0', so no separate
        # pass is needed for it. The earlier substitution on the literal
        # text 'xa0' only damaged words containing those three letters.
        page_text = re.sub(r'\W+', ' ', ' '.join(paragraphs))
    except Exception as exc:
        # Best effort: one bad page must not stop the crawl. Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit through.
        print("Failed to scrape %s: %s" % (tmp_url_in, exc))
        print("Connection refused by the server..")
        print("Let me sleep for 5 seconds")
        print("ZZzzzz...")
        time.sleep(5)
        print("Was a nice sleep, now let me continue...")
    return page_text
def fetch_urls(query_tmp, cnt):
    """Run a Google search and return the target URLs of its result links.

    Args:
        query_tmp: search phrase; runs of whitespace separate the terms.
        cnt: number of results to request (sent as the ``num`` parameter).

    Returns:
        list[str]: for each result block whose link looks like
        ``/url?q=<target>&sa...``, the ``<target>`` part, in page order.
        Links that do not fit that pattern are dropped. The targets are
        returned exactly as they appear in the href (not URL-decoded).

    Raises:
        requests.exceptions.RequestException: if the search request fails
            or does not answer within 10 seconds.
    """
    # pip install pyyaml ua-parser user-agents fake-useragent
    import re
    from urllib.parse import urlencode

    import requests
    from bs4 import BeautifulSoup
    from fake_useragent import UserAgent

    ua = UserAgent()

    # urlencode joins the terms with '+' just like the previous hand-built
    # URL, and additionally escapes characters such as '&', '#' or '+' that
    # would otherwise corrupt the query string.
    query = ' '.join(query_tmp.split())
    google_url = ("https://www.google.com/search?"
                  + urlencode({"q": query, "num": cnt}))
    print(google_url)

    # NOTE(review): the User-Agent has always been passed as the second
    # positional argument of requests.get, which is `params` -- so it is
    # sent as a query parameter, not as an HTTP header. That is kept (made
    # explicit here) because the selectors below may depend on the markup
    # Google serves to the default client; confirm before switching to
    # `headers=`.
    response = requests.get(
        google_url, params={"User-Agent": ua.random}, timeout=10
    )
    soup = BeautifulSoup(response.text, "html.parser")

    # Class names are tied to Google's current result markup.
    result_div = soup.find_all('div', attrs={'class': 'egMi0 kCrYT'})

    target_pattern = re.compile(r'/url\?q=(.*)&sa')
    clean_links = []
    for block in result_div:
        anchor = block.find('a', href=True)
        if anchor is None:
            # Result block without a link: nothing to extract.
            continue
        match = target_pattern.search(anchor['href'])
        # Anything that doesn't fit the pattern is skipped.
        if match is not None:
            clean_links.append(match.group(1))
    return clean_links
def write_crawl_results(my_query, the_cnt_in):
    """Crawl search results for each query and return the scraped text.

    For every query this calls ``init()``, gets result URLs from
    ``fetch_urls`` and downloads each one with ``my_scraper``. Pages that
    yield no text are skipped; the URL of every kept page is printed.

    Args:
        my_query: iterable of search phrases. A single string is treated as
            one query rather than being iterated character by character.
        the_cnt_in: number of results to request per query.

    Returns:
        pandas.DataFrame: one row per kept page, with columns
        ``body`` (scraped text), ``label`` (the query with spaces replaced
        by underscores), ``body_clean`` (``clean_text`` of the body),
        ``body_sw`` (``rem_sw`` applied to ``body_clean``) and
        ``body_sw_stem`` (``my_stem`` applied to ``body_sw``). The frame has
        these columns even when no page was scraped.
    """
    import pandas as pd

    if isinstance(my_query, str):
        my_query = [my_query]

    # Rows are collected in a list and the frame is built once:
    # DataFrame.append no longer exists in pandas 2.x, and appending row by
    # row copied the whole frame on every page.
    rows = []
    for q_blah in my_query:
        init()
        label = q_blah.replace(' ', '_')
        for page_url in fetch_urls(q_blah, the_cnt_in):
            tmp_txt = my_scraper(page_url)
            if not tmp_txt:
                continue
            rows.append({
                'body': tmp_txt,
                'label': label,
                'body_clean': clean_text(tmp_txt),
            })
            print(page_url)

    # Explicit columns keep the lookups below valid for an empty crawl.
    tmp_pd = pd.DataFrame(rows, columns=['body', 'label', 'body_clean'])
    # rem_sw / my_stem are not defined in this file; presumably they come
    # from the `from utils import *` at the top -- verify.
    tmp_pd["body_sw"] = tmp_pd["body_clean"].apply(rem_sw)
    tmp_pd["body_sw_stem"] = tmp_pd["body_sw"].apply(my_stem)
    return tmp_pd