word2vec_syntax/script_to_model.py at master · Nowow/word2vec_syntax · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 15 04:54:50 2016

@author: robert
"""

import pickle

# Use nltk.download() to fetch the stopwords corpus if it is not installed yet.


# Iterable to be passed to the word2vec class as sentences.
# Reads sentences one by one from a pickle dump.

class SentIterable(object):
    """Restartable iterable over sentences stored as consecutive pickle records.

    Every call to ``__iter__`` opens ``dumpPath`` afresh, so each pass starts
    at the first record and the file is closed when the pass ends. The file
    is no longer opened when the class is defined, so importing this module
    performs no I/O and does not fail when the dump is absent.

    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the
    file; only point ``dumpPath`` at dumps you produced yourself.
    """

    # correct path to parsed corpus dump HERE
    dumpPath = '/run/media/robert/1TB-1/linuxfolder/pythonworks/corporaDump'
    # dumpPath = '/run/media/robert/1TB-1/linuxfolder/pythonworks/miniCorp'

    # File object of the pass currently in progress; None between passes.
    dumpOpened = None

    # Upper bound on the number of records yielded per pass. The default is
    # far larger than the lengths noted below, so a pass normally ends at EOF.
    # Original author's notes (presumably record counts -- verify):
    #   full corpus 86468867, mini corpus 3410461.
    sentCount = 10000000000

    def __iter__(self):
        """Yield up to ``sentCount`` unpickled records, starting from the top.

        Prints 'Pickler Done' when the end of the dump is reached. Raises
        whatever ``open`` raises if ``dumpPath`` cannot be opened.
        """
        with open(self.dumpPath, 'rb') as dump:
            self.dumpOpened = dump
            try:
                for _ in range(self.sentCount):
                    # Keep the try narrow: only the load signals end of data.
                    try:
                        sent = pickle.load(dump)
                    except EOFError:
                        print('Pickler Done')
                        return
                    yield sent
            finally:
                # Do not clobber the handle of another pass that started later.
                if self.dumpOpened is dump:
                    self.dumpOpened = None


class ContIterable(object):
    """Restartable iterable over records stored as consecutive pickles.

    Every call to ``__iter__`` opens ``dumpPath`` afresh, so each pass starts
    at the first record and the file is closed when the pass ends. The file
    is no longer opened when the class is defined, so importing this module
    performs no I/O and does not fail when the dump is absent.

    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the
    file; only point ``dumpPath`` at dumps you produced yourself.
    """

    # correct path to parsed corpus dump HERE
    dumpPath = '/run/media/robert/1TB-1/linuxfolder/pythonworks/contDumpFinal'
    # dumpPath = '/run/media/robert/1TB-1/linuxfolder/pythonworks/miniCont'

    # File object of the pass currently in progress; None between passes.
    dumpOpened = None

    # Upper bound on the number of records yielded per pass. The default is
    # far larger than the lengths noted below, so a pass normally ends at EOF.
    # Original author's notes (presumably record counts -- verify):
    #   full dump 98366448, mini corpus 3410461.
    sentCount = 10000000000

    def __iter__(self):
        """Yield up to ``sentCount`` unpickled records, starting from the top.

        Prints 'Pickler for synt Done' when the end of the dump is reached.
        Raises whatever ``open`` raises if ``dumpPath`` cannot be opened.
        """
        with open(self.dumpPath, 'rb') as dump:
            self.dumpOpened = dump
            try:
                for _ in range(self.sentCount):
                    # Keep the try narrow: only the load signals end of data.
                    try:
                        record = pickle.load(dump)
                    except EOFError:
                        print('Pickler for synt Done')
                        return
                    yield record
            finally:
                # Do not clobber the handle of another pass that started later.
                if self.dumpOpened is dump:
                    self.dumpOpened = None


import logging

# Configure the root logger: timestamped records, INFO level and above.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)


# Training hyperparameters handed to Word2Vec below.
num_features = 500     # Word vector dimensionality
min_word_count = 100   # Minimum word count
num_workers = 4        # Number of threads to run in parallel
context = 2            # Context window size
downsampling = 1e-3    # Downsample setting for frequent words


# Alternative run, kept for reference: trains on SentIterable and passes
# ContIterable through the `synt_cash` argument, with hs=1 and negative=0.
#from gensim.models import word2vec_synt
#print("Training model...")
#model = word2vec_synt.Word2Vec(SentIterable(), workers=num_workers, \
#            size=num_features, min_count = min_word_count, \
#            window = context, sample = downsampling, sg = 1, hs = 1, negative = 0,
#            synt_cash = ContIterable())

# Active run: the ContIterable stream is passed as the sentences argument.
sentences = ContIterable()
from gensim.models import word2vec_synt
print("Training model...")
model = word2vec_synt.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
    sg=1,
)