-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindexer.py
More file actions
235 lines (172 loc) · 8.24 KB
/
indexer.py
File metadata and controls
235 lines (172 loc) · 8.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import os
import re
import json
import zlib
from datetime import datetime
from typing import List, Dict, Any, Tuple
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
# NOTE: runs at import time; may hit the network the first time the
# 'stopwords' corpus is fetched.
nltk.download('stopwords')
# English stop-word set and stemmer shared by parse_content().
stop_words = set(stopwords.words('english'))
snow_stemmer = SnowballStemmer(language='english')
def get_lexicon() -> Dict[str, List[int]]:
    """Return the lexicon saved by a previous run, or a fresh one.

    A fresh lexicon holds only the bookkeeping entry "word_count",
    whose first element is the next word id to assign.
    """
    # A previous run leaves its lexicon on disk; reuse it when present.
    if os.path.isfile('lexicon.txt'):
        with open('lexicon.txt', "r") as prev_lexicon:
            return json.load(prev_lexicon)
    # First run: start with just the bookkeeping counter.
    return {"word_count": [0, 0]}
def get_document_index() -> Any:
    """Return the persisted document index if the file exists, else {}."""
    # Nothing indexed yet: hand back an empty mapping.
    if not os.path.isfile('./document_index.txt'):
        return {}
    with open('./document_index.txt') as doc_idx:
        return json.load(doc_idx)
def get_forward_barrels() -> List:
    """Create the 300 forward-barrel files and return their open handles.

    Files are opened in 'w' mode, truncating any previous contents.
    Callers are responsible for closing the returned handles.
    """
    # Fix: open() fails with FileNotFoundError when the directory is
    # missing, so make sure it exists first (no-op when it already does).
    os.makedirs('./ForwardBarrels', exist_ok=True)
    return [
        open('./ForwardBarrels/forward_barrel_{}.txt'.format(barrel_count), 'w')
        for barrel_count in range(1, 301)
    ]
def get_forward_dicts() -> List[Dict]:
    """Return 300 fresh, independent dictionaries, one per forward barrel."""
    # A comprehension guarantees each barrel gets its own dict object.
    return [{} for _ in range(300)]
def write_forward_barrels(forward_dicts: List, forward_barrels: List) -> None:
    """Serialize each forward dictionary into its matching barrel file.

    Every (key, hitlist) item is written as one JSON line. The two
    arguments are parallel lists (one dict per barrel file, 300 each).
    """
    # Fix: zip pairs each dict with its barrel directly, replacing the
    # manual while-counter that shadowed the builtin `id`.
    for forward_dict, barrel_file in zip(forward_dicts, forward_barrels):
        for entry in forward_dict.items():
            barrel_file.write(json.dumps(entry))
            barrel_file.write("\n")
def parse_content(content: Any) -> List:
    """Lowercase *content*, keep alphabetic tokens, drop stop words, stem."""
    # Replace every non-letter with a space, then split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    # Stem whatever survives the stop-word filter.
    return [snow_stemmer.stem(token) for token in tokens if token not in stop_words]
def process_article_title(stemmed_title: Any, forward_dicts: List[Dict], lexicon: Dict[str, List[int]], hashed_id: int, word_count: int) -> int:
    """Record title hits for one article in the forward dictionaries.

    Unseen words get the next free id and a lexicon entry. Each title
    word's hitlist is [[1, title_hit_count], [0, 0]] — the first sub-list
    is the title hits, the second (content hits) starts empty.

    Returns the updated word_count, i.e. the next free word id.
    """
    for word in stemmed_title:
        # Assign a fresh id to words never seen before.
        if word not in lexicon:
            lexicon[word] = [word_count, 0]
            word_count += 1
        word_id = lexicon[word][0]
        # 533 consecutive word ids map onto each of the 300 barrels.
        # Fix: integer floor division instead of int(x / 533).
        barrel = forward_dicts[word_id // 533]
        key = (hashed_id, word_id)
        if key not in barrel:
            # Build the hitlist in one literal instead of paired inserts;
            # also drops the dead `position` counter the old code kept.
            barrel[key] = [[1, 1], [0, 0]]
        else:
            # Seen again in this title: bump the title hit count.
            barrel[key][0][1] += 1
    return word_count
def process_article_content(stemmed_words: Any, forward_dicts: List[Dict], lexicon: Dict[str, List[int]], hashed_id: int, word_count: int) -> int:
    """Record content hits (with positions) for one article.

    Unseen words get the next free id and a lexicon entry. Each word's
    hitlist is [[1, 0], [0, hit_count, pos1, pos2, ...]] — the first
    sub-list is the (empty) title hits, the second holds the content hit
    count followed by every 1-based position of the word in the document.

    Returns the updated word_count, i.e. the next free word id.
    """
    # enumerate replaces the manual position counter; positions are 1-based.
    for position, word in enumerate(stemmed_words, start=1):
        if word not in lexicon:
            lexicon[word] = [word_count, 0]
            word_count += 1
        word_id = lexicon[word][0]
        # 533 consecutive word ids map onto each of the 300 barrels.
        barrel = forward_dicts[word_id // 533]
        key = (hashed_id, word_id)
        if key not in barrel:
            # One literal instead of the paired insert(0,...)/insert(1,...).
            barrel[key] = [[1, 0], [0, 1, position]]
        else:
            # Seen again in this document: bump count, append position.
            barrel[key][1][1] += 1
            barrel[key][1].append(position)
    return word_count
def process_loaded_data(loaded_data: Any, forward_dicts: List[Dict], lexicon: Dict[str, List[int]], document_index: Dict, doc_count: int, word_count: int) -> Tuple[int, int]:
    """Index every not-yet-seen article in *loaded_data*.

    Articles already present in *document_index* are skipped. Returns the
    updated (doc_count, word_count) pair.
    """
    for article in loaded_data:
        # Documents are keyed by a CRC32 hash of their string id.
        hashed_id = zlib.crc32(bytes(article['id'], 'utf-8'))
        index_key = str(hashed_id)
        # Guard clause: skip anything a previous pass already indexed.
        if index_key in document_index:
            continue
        document_index[index_key] = article['url']
        doc_count += 1
        # Stem/normalize both fields, then fold title hits first,
        # content hits second (word ids are assigned in that order).
        stemmed_words = parse_content(article['content'])
        stemmed_title = parse_content(article['title'])
        word_count = process_article_title(
            stemmed_title, forward_dicts, lexicon, hashed_id, word_count)
        word_count = process_article_content(
            stemmed_words, forward_dicts, lexicon, hashed_id, word_count)
    return doc_count, word_count
def generate_forward_index(path_to_data: str) -> List:
    """Parse the .json files under *path_to_data* and build the forward
    index, persisting the updated lexicon and document index.

    Returns [status, doc_count, time_taken] where status is 1 when at
    least one new document was indexed, else 0.
    """
    start = datetime.now()
    doc_count = 0
    lexicon = get_lexicon()
    word_count = lexicon["word_count"][0]
    # Fix: bind these before the try block. Previously an early exception
    # left `document_index` unbound (NameError at the final dump) and the
    # 300 barrel handles were never closed.
    document_index = get_document_index()
    forward_barrels = []
    try:
        # Every .json file in the data directory gets indexed.
        file_names = [pos_json for pos_json in os.listdir(
            path_to_data) if pos_json.endswith('.json')]
        forward_barrels = get_forward_barrels()
        for file_name in file_names:
            # Fresh forward dictionaries per file keep memory bounded.
            forward_dicts = get_forward_dicts()
            with open("{}/{}".format(path_to_data, file_name)) as f:
                loaded_data = json.load(f)
            doc_count, word_count = process_loaded_data(
                loaded_data, forward_dicts, lexicon, document_index,
                doc_count, word_count)
            write_forward_barrels(forward_dicts, forward_barrels)
    except Exception as error:
        # Best-effort: report and fall through so partial results persist.
        print(error)
    finally:
        # Fix: release all barrel file handles (they used to leak).
        for barrel in forward_barrels:
            barrel.close()
    # Persist the lexicon with the updated next-free-word-id counter.
    lexicon["word_count"][0] = word_count
    with open('lexicon.txt', "w") as new_lexicon:
        new_lexicon.write(json.dumps(lexicon))
    # Persist the document index so reruns skip already-indexed articles.
    with open('./document_index.txt', 'w') as new_document_index:
        new_document_index.write(json.dumps(document_index))
    end = datetime.now()
    time_taken = str(end - start)
    print("The time of execution to create forward index and lexicon is:", time_taken)
    print('doc_count = ', doc_count)
    print('word_count = ', word_count)
    # Status flag is 1 when anything new was indexed this run.
    return [1 if doc_count else 0, doc_count, time_taken]