-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindexer.py
More file actions
235 lines (172 loc) · 8.24 KB
/
indexer.py
File metadata and controls
235 lines (172 loc) · 8.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import os
import re
import json
import zlib
from datetime import datetime
from typing import List, Dict, Any, Tuple
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
# NOTE: runs at import time; may hit the network the first time the
# 'stopwords' corpus is fetched.
nltk.download('stopwords')
# English stop-word set and stemmer shared by parse_content().
stop_words = set(stopwords.words('english'))
snow_stemmer = SnowballStemmer(language='english')
def get_lexicon() -> Dict[str, List[int]]:
    """Return the lexicon saved by a previous run, or a fresh one.

    A fresh lexicon holds only the bookkeeping entry "word_count",
    whose first element is the next word id to assign.
    """
    # A previous run leaves its lexicon on disk; reuse it when present.
    if os.path.isfile('lexicon.txt'):
        with open('lexicon.txt', "r") as prev_lexicon:
            return json.load(prev_lexicon)
    # First run: start with just the bookkeeping counter.
    return {"word_count": [0, 0]}
def get_document_index() -> Any:
    """Return the persisted document index if the file exists, else {}."""
    # Nothing indexed yet: hand back an empty mapping.
    if not os.path.isfile('./document_index.txt'):
        return {}
    with open('./document_index.txt') as doc_idx:
        return json.load(doc_idx)
def get_forward_barrels() -> List:
    """Create the 300 forward-barrel files and return their open handles.

    Files are opened in 'w' mode, truncating any previous contents.
    Callers are responsible for closing the returned handles.
    """
    # Fix: open() fails with FileNotFoundError when the directory is
    # missing, so make sure it exists first (no-op when it already does).
    os.makedirs('./ForwardBarrels', exist_ok=True)
    return [
        open('./ForwardBarrels/forward_barrel_{}.txt'.format(barrel_count), 'w')
        for barrel_count in range(1, 301)
    ]
def get_forward_dicts() -> List[Dict]:
    """Return 300 fresh, independent dictionaries, one per forward barrel."""
    # A comprehension guarantees each barrel gets its own dict object.
    return [{} for _ in range(300)]
def write_forward_barrels(forward_dicts: List, forward_barrels: List) -> None:
    """Serialize each forward dictionary into its matching barrel file.

    Every (key, hitlist) item is written as one JSON line. The two
    arguments are parallel lists (one dict per barrel file, 300 each).
    """
    # Fix: zip pairs each dict with its barrel directly, replacing the
    # manual while-counter that shadowed the builtin `id`.
    for forward_dict, barrel_file in zip(forward_dicts, forward_barrels):
        for entry in forward_dict.items():
            barrel_file.write(json.dumps(entry))
            barrel_file.write("\n")
def parse_content(content: Any) -> List:
    """Lowercase *content*, keep alphabetic tokens, drop stop words, stem."""
    # Replace every non-letter with a space, then split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    # Stem whatever survives the stop-word filter.
    return [snow_stemmer.stem(token) for token in tokens if token not in stop_words]
def process_article_title(stemmed_title: Any, forward_dicts: List[Dict], lexicon: Dict[str, List[int]], hashed_id: int, word_count: int) -> int:
    """Record title hits for one article in the forward dictionaries.

    Unseen words get the next free id and a lexicon entry. Each title
    word's hitlist is [[1, title_hit_count], [0, 0]] — the first sub-list
    is the title hits, the second (content hits) starts empty.

    Returns the updated word_count, i.e. the next free word id.
    """
    for word in stemmed_title:
        # Assign a fresh id to words never seen before.
        if word not in lexicon:
            lexicon[word] = [word_count, 0]
            word_count += 1
        word_id = lexicon[word][0]
        # 533 consecutive word ids map onto each of the 300 barrels.
        # Fix: integer floor division instead of int(x / 533).
        barrel = forward_dicts[word_id // 533]
        key = (hashed_id, word_id)
        if key not in barrel:
            # Build the hitlist in one literal instead of paired inserts;
            # also drops the dead `position` counter the old code kept.
            barrel[key] = [[1, 1], [0, 0]]
        else:
            # Seen again in this title: bump the title hit count.
            barrel[key][0][1] += 1
    return word_count
def process_article_content(stemmed_words: Any, forward_dicts: List[Dict], lexicon: Dict[str, List[int]], hashed_id: int, word_count: int) -> int:
    """Record content hits (with positions) for one article.

    Unseen words get the next free id and a lexicon entry. Each word's
    hitlist is [[1, 0], [0, hit_count, pos1, pos2, ...]] — the first
    sub-list is the (empty) title hits, the second holds the content hit
    count followed by every 1-based position of the word in the document.

    Returns the updated word_count, i.e. the next free word id.
    """
    # enumerate replaces the manual position counter; positions are 1-based.
    for position, word in enumerate(stemmed_words, start=1):
        if word not in lexicon:
            lexicon[word] = [word_count, 0]
            word_count += 1
        word_id = lexicon[word][0]
        # 533 consecutive word ids map onto each of the 300 barrels.
        barrel = forward_dicts[word_id // 533]
        key = (hashed_id, word_id)
        if key not in barrel:
            # One literal instead of the paired insert(0,...)/insert(1,...).
            barrel[key] = [[1, 0], [0, 1, position]]
        else:
            # Seen again in this document: bump count, append position.
            barrel[key][1][1] += 1
            barrel[key][1].append(position)
    return word_count
def process_loaded_data(loaded_data: Any, forward_dicts: List[Dict], lexicon: Dict[str, List[int]], document_index: Dict, doc_count: int, word_count: int) -> Tuple[int, int]:
    """Index every not-yet-seen article in *loaded_data*.

    Articles already present in *document_index* are skipped. Returns the
    updated (doc_count, word_count) pair.
    """
    for article in loaded_data:
        # Documents are keyed by a CRC32 hash of their string id.
        hashed_id = zlib.crc32(bytes(article['id'], 'utf-8'))
        index_key = str(hashed_id)
        # Guard clause: skip anything a previous pass already indexed.
        if index_key in document_index:
            continue
        document_index[index_key] = article['url']
        doc_count += 1
        # Stem/normalize both fields, then fold title hits first,
        # content hits second (word ids are assigned in that order).
        stemmed_words = parse_content(article['content'])
        stemmed_title = parse_content(article['title'])
        word_count = process_article_title(
            stemmed_title, forward_dicts, lexicon, hashed_id, word_count)
        word_count = process_article_content(
            stemmed_words, forward_dicts, lexicon, hashed_id, word_count)
    return doc_count, word_count
def generate_forward_index(path_to_data: str) -> List:
    """Parse the .json files under *path_to_data* and build the forward
    index, persisting the updated lexicon and document index.

    Returns [status, doc_count, time_taken] where status is 1 when at
    least one new document was indexed, else 0.
    """
    start = datetime.now()
    doc_count = 0
    lexicon = get_lexicon()
    word_count = lexicon["word_count"][0]
    # Fix: bind these before the try block. Previously an early exception
    # left `document_index` unbound (NameError at the final dump) and the
    # 300 barrel handles were never closed.
    document_index = get_document_index()
    forward_barrels = []
    try:
        # Every .json file in the data directory gets indexed.
        file_names = [pos_json for pos_json in os.listdir(
            path_to_data) if pos_json.endswith('.json')]
        forward_barrels = get_forward_barrels()
        for file_name in file_names:
            # Fresh forward dictionaries per file keep memory bounded.
            forward_dicts = get_forward_dicts()
            with open("{}/{}".format(path_to_data, file_name)) as f:
                loaded_data = json.load(f)
            doc_count, word_count = process_loaded_data(
                loaded_data, forward_dicts, lexicon, document_index,
                doc_count, word_count)
            write_forward_barrels(forward_dicts, forward_barrels)
    except Exception as error:
        # Best-effort: report and fall through so partial results persist.
        print(error)
    finally:
        # Fix: release all barrel file handles (they used to leak).
        for barrel in forward_barrels:
            barrel.close()
    # Persist the lexicon with the updated next-free-word-id counter.
    lexicon["word_count"][0] = word_count
    with open('lexicon.txt', "w") as new_lexicon:
        new_lexicon.write(json.dumps(lexicon))
    # Persist the document index so reruns skip already-indexed articles.
    with open('./document_index.txt', 'w') as new_document_index:
        new_document_index.write(json.dumps(document_index))
    end = datetime.now()
    time_taken = str(end - start)
    print("The time of execution to create forward index and lexicon is:", time_taken)
    print('doc_count = ', doc_count)
    print('word_count = ', word_count)
    # Status flag is 1 when anything new was indexed this run.
    return [1 if doc_count else 0, doc_count, time_taken]