-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcumulative_frequency.py
More file actions
70 lines (54 loc) · 2.16 KB
/
cumulative_frequency.py
File metadata and controls
70 lines (54 loc) · 2.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from typing import List
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter
from collections import Counter
import random
from src import config
from src.utils import human_format
def make_cumulative_frequency_fig(words: List[str],
                                  corpus_partitions: List[List[str]],
                                  ) -> plt.Figure:
    """
    Return a figure showing the time course of the cumulative frequency
    of "words" across "corpus_partitions".

    Each partition is counted independently; for every word the per-partition
    counts are accumulated with a running sum and drawn as one line, with the
    partition index on the x-axis. A line is labelled with its word at its
    right-most point only if the word's final cumulative frequency exceeds
    one tenth of the largest final cumulative frequency among "words".

    Args:
        words: words whose cumulative frequency is plotted (one line each).
        corpus_partitions: the corpus, split into consecutive lists of tokens.

    Returns:
        The matplotlib figure. If "words" or "corpus_partitions" is empty,
        the figure contains labelled axes but no lines.

    Note:
        Figure size, dpi and font sizes are read from ``config.Fig``.
        Each word and its cumulative-frequency array are printed to stdout.
    """
    # count word occurrences separately within each partition
    num_parts = len(corpus_partitions)
    part2w2f = [Counter(part) for part in corpus_partitions]

    # collect cum. frequencies for each word
    x = np.arange(num_parts)
    w2y = []
    for w in words:
        frequencies = [w2f[w] for w2f in part2w2f]  # Counter yields 0 for absent words
        y = np.cumsum(frequencies)
        print(w)
        print(y)
        w2y.append((w, y))

    # fig
    res, ax = plt.subplots(figsize=config.Fig.fig_size, dpi=config.Fig.dpi)
    ax.set_xlabel('Corpus Location', fontsize=config.Fig.ax_label_fontsize)
    ax.set_ylabel('Cumulative Frequency', fontsize=config.Fig.ax_label_fontsize)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.xaxis.set_major_formatter(FuncFormatter(human_format))

    # nothing to draw: without this guard, np.max([]) and y[-1] below would raise
    if not w2y or num_parts == 0:
        return res

    # plot
    last_x = x[-1]  # all lines end at the last partition
    y_thr = np.max([y[-1] for _, y in w2y]) / 10  # threshold is a tenth of the max final frequency
    palette = iter(sns.color_palette("hls", len(words)))
    for w, y in w2y:
        ax.plot(x, y, '-', linewidth=1.0, c=next(palette))
        last_y = y[-1]  # final cumulative frequency, used to position the label
        if last_y > y_thr:
            # annotate on the axes created above rather than pyplot's "current" axes
            ax.annotate(w, xy=(last_x, last_y),
                        xytext=(0, 0), textcoords='offset points',
                        va='center', fontsize=config.Fig.legend_fontsize,
                        bbox=dict(boxstyle='round', fc='w'))
    return res
# parameters of the synthetic demo corpus
NUM_PARTS = 256  # number of corpus partitions
PART_SIZE = 100  # number of tokens sampled into each partition
NUM_WORDS = 50  # size of the artificial vocabulary

# artificial vocabulary
words = [f'w{i}' for i in range(NUM_WORDS)]

# every partition first draws its own pool of 10 words (with replacement),
# then samples its tokens from that pool
parts = [
    random.choices(pool, k=PART_SIZE)
    for pool in (random.choices(words, k=10) for _ in range(NUM_PARTS))
]

# plot the first five words only
fig = make_cumulative_frequency_fig(words=words[:5], corpus_partitions=parts)
fig.show()