# General_Python_Anonymous-google-extraction-/crawler.py at main · WatchTree-19/General_Python_Anonymous-google-extraction- · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# -*- coding: utf-8 -*-


from utils import *

def clean_text(str_in):
    """Lower-case *str_in* and turn every run of non-letters into one space.

    After lower-casing, anything that is not an ASCII letter (digits,
    punctuation, whitespace, accented characters, ...) is treated as a
    separator, and each consecutive run of separators is replaced by a
    single space. The result is not stripped, so a leading or trailing
    separator run leaves a leading or trailing space.

    Args:
        str_in: The text to normalise.

    Returns:
        The normalised string.
    """
    import re
    lowered = str_in.lower()
    non_letters = re.compile("[^A-Za-z]+")
    return non_letters.sub(" ", lowered)

def init():
    """Open a 3-hop Tor circuit, send one HTTP GET to ifconfig.me, and close it.

    The response (at most 1024 bytes are read) is discarded, and the Tor
    client, circuit and stream are all closed before the function returns.

    NOTE(review): nothing in this function appears to configure later
    ``requests`` calls to go through Tor -- confirm whether that is intended.

    Returns:
        Always 0.
    """
    # https://pypi.org/project/torpy/
    # pip install torpy
    from torpy import TorClient

    # An onion hostname can be used here as well.
    hostname = 'ifconfig.me'
    request = b'GET / HTTP/1.0\r\nHost: %s\r\n\r\n' % hostname.encode()

    # Guard node selection and circuit construction are handled by torpy;
    # the three context managers are released in reverse order on exit.
    with TorClient() as tor, \
            tor.create_circuit(3) as circuit, \
            circuit.create_stream((hostname, 80)) as stream:
        stream.send(request)
        stream.recv(1024)
    return 0

def my_scraper(tmp_url_in):
    """Download a page and return the text of its ``<p>`` elements.

    The text of every paragraph is joined with spaces, non-breaking spaces
    are converted to ordinary spaces, and every run of non-word characters
    is collapsed into a single space.

    This is a best-effort scraper: if the download or the parsing fails,
    a notice is printed, the function sleeps for 5 seconds (to back off
    from the server) and an empty string is returned.

    Args:
        tmp_url_in: URL of the page to fetch.

    Returns:
        The normalised paragraph text, or ``''`` on failure. The result is
        always a ``str``.
    """
    from bs4 import BeautifulSoup
    import requests
    import re
    import time

    try:
        content = requests.get(tmp_url_in, timeout=10)
        soup = BeautifulSoup(content.text, 'html.parser')
        paragraphs = ' '.join(tag.text for tag in soup.find_all('p'))
    except Exception:
        # Deliberately broad so one bad page never aborts a crawl, but
        # unlike a bare ``except`` this lets KeyboardInterrupt/SystemExit
        # through so the crawl can still be stopped by the user.
        print("Connection refused by the server..")
        print("Let me sleep for 5 seconds")
        print("ZZzzzz...")
        time.sleep(5)
        print("Was a nice sleep, now let me continue...")
        return ''

    # Replace the non-breaking space character itself; the previous pattern
    # 'xa0' matched the three literal letters and mangled words such as
    # "taxa0". The raw string avoids the invalid '\W' escape warning.
    return re.sub(r'\W+', ' ', paragraphs.replace('\xa0', ' '))

def fetch_urls(query_tmp, cnt):
    """Run a Google search and return the result URLs found on the page.

    The search URL is printed before the request is made. Result anchors
    are taken from ``div`` elements with class ``egMi0 kCrYT``; each href
    of the form ``/url?q=<target>&sa...`` contributes ``<target>``, and
    hrefs that do not match that shape are dropped.

    NOTE(review): the returned targets are exactly what appears in the
    href; no percent-decoding is applied -- confirm whether callers need it.

    Args:
        query_tmp: Search phrase; runs of whitespace separate the terms.
        cnt: Number of results to request (sent as the ``num`` parameter).

    Returns:
        A list of target URL strings, in page order.

    Raises:
        requests.exceptions.RequestException: If the search request fails
            or does not complete within 10 seconds.
    """
    # pip install pyyaml ua-parser user-agents fake-useragent
    import re
    from urllib.parse import urlencode

    import requests
    from bs4 import BeautifulSoup
    from fake_useragent import UserAgent

    ua = UserAgent()

    # urlencode produces the same "a+b" form for plain words while also
    # escaping characters such as '&', '+' or '#' that would otherwise
    # corrupt the query string.
    query = ' '.join(query_tmp.split())
    google_url = "https://www.google.com/search?" + urlencode(
        {"q": query, "num": cnt})
    print(google_url)

    # The User-Agent must be passed as ``headers``; the second positional
    # argument of requests.get is ``params`` and would end up in the URL.
    response = requests.get(
        google_url, headers={"User-Agent": ua.random}, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")

    result_div = soup.find_all('div', attrs={'class': 'egMi0 kCrYT'})

    target_pattern = re.compile(r'/url\?q=(.*)&sa')
    clean_links = []
    for r in result_div:
        link = r.find('a', href=True)
        if link is None:
            # No anchor with an href inside this result block.
            continue
        match = target_pattern.search(link['href'])
        # Anything that doesn't fit the redirect pattern is skipped.
        if match is not None:
            clean_links.append(match.group(1))

    return clean_links

def write_crawl_results(my_query, the_cnt_in):
    """Crawl each query and collect the scraped pages into a DataFrame.

    For every query, ``init()`` is called, ``fetch_urls`` supplies the
    result URLs and ``my_scraper`` downloads each one. Pages that yield
    no text are skipped; the URL of every kept page is printed.

    Args:
        my_query: Iterable of search phrases.
        the_cnt_in: Number of results to request per query (forwarded to
            ``fetch_urls``).

    Returns:
        A ``pandas.DataFrame`` with one row per kept page and the columns
        ``body`` (scraped text), ``label`` (the query with spaces replaced
        by underscores), ``body_clean`` (``clean_text`` of the body),
        ``body_sw`` (``rem_sw`` applied to ``body_clean``) and
        ``body_sw_stem`` (``my_stem`` applied to ``body_sw``). If no page
        yielded text, an empty DataFrame without columns is returned.
    """
    # Use fetch_urls to get URLs, then pass each one to my_scraper.
    import re
    import pandas as pd

    # Rows are gathered in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0, and the old bare
    # ``except: pass`` around it silently dropped every row.
    rows = []
    for q_blah in my_query:
        init()
        label = re.sub(' ', '_', q_blah)

        for word in fetch_urls(q_blah, the_cnt_in):
            tmp_txt = my_scraper(word)
            if not tmp_txt:
                continue
            rows.append({'body': tmp_txt,
                         'label': label,
                         'body_clean': clean_text(tmp_txt),
                         })
            print(word)

    if not rows:
        return pd.DataFrame()

    tmp_pd = pd.DataFrame(rows, columns=['body', 'label', 'body_clean'])
    # Derived columns are computed once over the finished frame instead of
    # being recomputed for the whole frame after every scraped page.
    # rem_sw and my_stem are not defined in this file; presumably they come
    # from the ``utils`` star import.
    tmp_pd["body_sw"] = tmp_pd.body_clean.apply(rem_sw)
    tmp_pd["body_sw_stem"] = tmp_pd.body_sw.apply(my_stem)
    return tmp_pd