-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathword_level_annotations.vim
More file actions
102 lines (74 loc) · 3.45 KB
/
word_level_annotations.vim
File metadata and controls
102 lines (74 loc) · 3.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
" Key mappings for word-level annotations of the form [@content#TAG|norm|].
" NOTE: comments must stay on their own lines here; anything written after a
" mapping on the same line would become part of the mapping itself.
"
" ce : call the Python function remove_tag_utf8() defined in the py3 block below.
" ,2 : call the Python function unify_tags() defined in the py3 block below.
" (:map defines these for Normal, Visual, Select and Operator-pending modes.)
map ce :py3 remove_tag_utf8()<CR>
map ,2 :py3 unify_tags()<CR>
" Visual-mode annotators. Each one changes the selection (c), which places the
" selected text in the unnamed register, types `[@`, pastes the register back
" with <c-r>", appends `#TAG||]` and leaves insert mode. `\|` is how a literal
" `|` is written inside a mapping.
"   cl -> GEO, cd -> DATE, cg -> ORG, cn -> PERSON, cr -> CURRENCY,
"   ,1 -> REPLACE_ME (placeholder tag)
vmap cl c[@<c-r>"#GEO\|\|]<ESC>
vmap cd c[@<c-r>"#DATE\|\|]<ESC>
vmap cg c[@<c-r>"#ORG\|\|]<ESC>
vmap cn c[@<c-r>"#PERSON\|\|]<ESC>
vmap cr c[@<c-r>"#CURRENCY\|\|]<ESC>
vmap ,1 c[@<c-r>"#REPLACE_ME\|\|]<ESC>
py3 << EOF
def unify_tags():
    """Merge the first pair of adjacent annotations on the cursor line.

    An annotation has the form ``[@content#TAG|norm|]``. The first place on
    the current line where two annotations directly follow each other
    (separated by at most one comma or period plus optional whitespace) is
    rewritten as a single annotation:

    * the two contents are joined with the original separator between them,
    * the tag of the LEFT annotation is kept (the right tag is discarded),
    * the two normalisation fields are concatenated.

    Only the matched pair is rewritten; the rest of the line is untouched.
    If the line contains no such pair the function returns without changing
    anything instead of raising.
    """
    import vim
    import re

    content_regex = "([^#]+)"
    tag_regex = r"([^|]+\?{0,1})"
    norm_regex = "([^|]*)"
    single_word_regex_string = (
        r"\[@" + content_regex + "#" + tag_regex + r"\|" + norm_regex + r"\|\]"
    )
    # Two annotations back to back; group 4 captures what sits between them.
    multi_word_regex = re.compile(
        single_word_regex_string + r"([,\.]{0,1}\s*)" + single_word_regex_string
    )

    (row, col) = vim.current.window.cursor
    # Read the line once; every buffer access goes through Vim.
    line = vim.current.buffer[row - 1]

    m = multi_word_regex.search(line)
    if m is None:
        # Nothing to merge on this line.
        return

    (content_left, tag_left, norm_left, space_between,
     content_right, _tag_right, norm_right) = m.groups()

    merged = (
        "[@" + content_left + space_between + content_right
        + "#" + tag_left + "|" + norm_left + norm_right + "|]"
    )

    # Splice by match position so that an identical pair elsewhere on the
    # line is not rewritten as a side effect (str.replace would hit them all).
    vim.current.buffer[row - 1] = line[:m.start()] + merged + line[m.end():]

    # Go back to where the cursor was.
    # NOTE(review): window.cursor reports a 0-based column while cursor()
    # takes a 1-based one, so this may land one byte to the left of the
    # original position -- confirm whether that is intended before changing.
    vim.command(":cal cursor(" + str(row) + ", " + str(col) + ")")
# Background on UTF-8, needed by remove_tag_utf8() below because Vim reports
# the cursor column as a byte offset, not a character offset.
# Source: https://www.johndcook.com/blog/2019/09/09/how-utf-8-works/
# A byte of the form 110xxxxx says the first five bits of a Unicode character are
# stored at the end of this byte, and the rest of the bits are coming in the next byte.
# A byte of the form 1110xxxx contains four bits of a Unicode character and says that the
# rest of the bits are coming over the next two bytes.
# A byte of the form 11110xxx contains three bits of a Unicode character and says that
# the rest of the bits are coming over the next three bytes.
# The bytes that follow such a lead byte have the form 10xxxxxx.
def remove_tag_utf8():
    """Strip the annotation markup around the cursor, keeping only its content.

    On the cursor line, the annotation ``[@content#TAG|norm|]`` that encloses
    the cursor is replaced by plain ``content``. The cursor may be anywhere
    between the opening ``[`` and the closing ``]``, including on the tag or
    normalisation part.

    The function does nothing (instead of raising) when the line is empty or
    when the cursor is not inside a complete annotation.
    """
    import vim

    # The problem with .cursor is that it gives the column as an offset into
    # the line's bytes, whereas Python indexes the line by character.
    (row, col) = vim.current.window.cursor
    # Read the line once; every buffer access goes through Vim.
    line = vim.current.buffer[row - 1]

    # Characters before the cursor == characters encoded by the first `col`
    # bytes. errors="ignore" drops a truncated trailing sequence should `col`
    # ever point into the middle of a multi-byte character.
    # Assumes Vim's 'encoding' is utf-8, exactly as the previous code did.
    char_col = len(line.encode("utf-8")[:col].decode("utf-8", errors="ignore"))

    # Last "[@" that starts at or before the cursor (the cursor may sit on
    # the "[" or on the "@" itself, hence the +2 on the search window).
    start_index = line.rfind("[@", 0, char_col + 2)
    if start_index == -1:
        return

    # Search the separators from the start of the annotation, not from the
    # cursor: otherwise a cursor placed after '#' would pick up the '#' of
    # the NEXT annotation (or run off the end of the line).
    end_content = line.find("#", start_index)
    if end_content == -1:
        return
    end_index = line.find("]", end_content)
    if end_index == -1 or end_index < char_col:
        # Unterminated annotation, or it closes before the cursor: the
        # cursor is not inside it, so leave the line alone.
        return

    vim.current.buffer[row - 1] = (
        line[:start_index]
        + line[start_index + 2:end_content]  # content without the "[@"
        + line[end_index + 1:]
    )

    # Go back to where the cursor was (byte column, as reported by Vim).
    # NOTE(review): window.cursor reports a 0-based column while cursor()
    # takes a 1-based one, so this may land one byte to the left of the
    # original position -- confirm whether that is intended before changing.
    vim.command(":cal cursor(" + str(row) + ", " + str(col) + ")")
EOF