-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtaggedtext.py
More file actions
executable file
·81 lines (67 loc) · 2.53 KB
/
taggedtext.py
File metadata and controls
executable file
·81 lines (67 loc) · 2.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/python
#based on code from https://joshuanewlan.com/spacy-and-markovify
import sys, re, markovify, spacy, textacy, random
from unidecode import unidecode
from importlib import reload
#import textacy.preprocess, textacy.preprocessing, textacy.preprocessing.resources
#textacy.constants = textacy.preprocessing.resources
#reload(textacy)
#reload(textacy.preprocess)
# Load the spaCy English pipeline once, at import time. `nlp` is not referenced
# by any other code in this file.
# NOTE(review): the 'en' shortcut name is rejected by spaCy 3+, which expects a
# full package name such as 'en_core_web_sm' -- confirm against the installed
# spaCy version.
nlp = spacy.load('en')
class TaggedText(markovify.Text):
    """Markov text model over part-of-speech-tagged tokens.

    Every word is stored in the chain as ``"<orth>::<pos>"``, so the same
    surface form used as two different parts of speech yields two distinct
    states. Unlike the base class, the corpus handed to this model is an
    iterable of parsed documents exposing ``.sents`` (presumably spaCy
    ``Doc`` objects -- confirm against the caller), not a raw string.
    """

    # Separator between a token's surface text and its part-of-speech tag.
    _TAG_SEP = "::"

    # Punctuation that would look strange on its own in a generated sentence:
    # leading, trailing or free-standing apostrophes, double quotes, and
    # round/square brackets. Compiled once instead of on every call.
    _REJECT_PAT = re.compile(r"(^')|('$)|\s'|'\s|[\"(\(\)\[\])]")

    def sentence_split(self, corpus):
        """Return the sentences of every document in ``corpus`` as a flat list.

        ``corpus`` is an iterable of documents, each exposing a ``.sents``
        iterable. The sentence objects are returned as-is, not as strings.
        """
        return [sent for doc in corpus for sent in doc.sents]

    def word_split(self, sentence):
        """Split a sentence into a list of ``"<orth>::<pos>"`` tokens.

        ``sentence`` is either an already-parsed sentence (an iterable of
        tokens carrying ``orth_`` and ``pos_``) or a plain string.
        """
        # Plain strings are parsed first so they produce the same tagged
        # tokens as corpus sentences; this is needed for comparisons like
        # the ones in make_sentence_from*.
        if isinstance(sentence, str):
            corpus = textacy.Corpus(lang='en')
            corpus.add_text(text=sentence)
            sentence = corpus.docs[0]
        return [
            self._TAG_SEP.join((word.orth_, word.pos_)) for word in sentence
        ]

    def word_join(self, words):
        """Join tagged tokens into plain text, dropping the part-of-speech tags.

        Only the last ``::`` in a token separates text from tag, so a token
        whose own text contains ``::`` is kept intact. A token without any
        separator is passed through unchanged.
        """
        return " ".join(word.rsplit(self._TAG_SEP, 1)[0] for word in words)

    def test_sentence_input(self, sentence):
        """Return True if ``sentence`` is acceptable as training input.

        Rejects sentences that contain the type of punctuation that would
        look strange on its own in a randomly-generated sentence.
        ``sentence`` may be a parsed sentence exposing ``.text`` or a plain
        string.
        """
        text = sentence if isinstance(sentence, str) else sentence.text
        # Transliterate to ASCII before matching, mainly so that fancy
        # quotation marks are normalized to plain ones the pattern can catch.
        decoded = unidecode(text)
        return not self._REJECT_PAT.search(decoded)

    def generate_corpus(self, corpus):
        """Turn ``corpus`` into a list of sentences, each a list of tagged words.

        Sentences come from ``self.sentence_split`` and are filtered through
        ``self.test_sentence_input`` before being split into words by
        ``self.word_split``. A concrete list is returned (not a lazy
        iterator) so the result can be traversed more than once.
        """
        sentences = self.sentence_split(corpus)
        return [
            self.word_split(sentence)
            for sentence in sentences
            if self.test_sentence_input(sentence)
        ]
if __name__ == '__main__':
    # A corpus argument is mandatory: without one, show the usage line and
    # exit with status 2.
    arg_count = len(sys.argv)
    if arg_count < 2:
        print('Usage: %s corpus' % sys.argv[0])
        sys.exit(2)