from typing import Iterator, Sequence, Text, Tuple, Union
from itertools import groupby
import numpy as np
from scipy.sparse import spmatrix
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier

NDArray = Union[np.ndarray, spmatrix]
TokenSeq = Sequence[Text]
PosSeq = Sequence[Text]


def read_ptbtagged(ptbtagged_path: str) -> Iterator[Tuple[TokenSeq, PosSeq]]:
"""Reads sentences from a Penn TreeBank .tagged file.
Each sentence is a sequence of tokens and part-of-speech tags.
Penn TreeBank .tagged files contain one token per line, with an empty line
marking the end of each sentence. Each line is composed of a token, a tab
character, and a part-of-speech tag. Here is an example:
What WP
's VBZ
next JJ
? .
Slides NNS
to TO
illustrate VB
Shostakovich NNP
quartets NNS
? .
:param ptbtagged_path: The path of a Penn TreeBank .tagged file, formatted
as above.
:return: An iterator over sentences, where each sentence is a tuple of
a sequence of tokens and a corresponding sequence of part-of-speech tags.
"""
    with open(ptbtagged_path, 'r') as f:
        # Blank lines separate sentences; group consecutive non-blank lines
        # into one sentence and split each line into (token, tag).
        for is_blank, lines in groupby(f, key=lambda line: not line.strip()):
            if not is_blank:
                pairs = [line.split() for line in lines]
                tokens = [pair[0] for pair in pairs]
                pos_tags = [pair[1] for pair in pairs]
                yield tokens, pos_tags
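# Usage sketch (assumes a file "sample.tagged" containing the two sentences
# shown in the docstring above; the path is illustrative):
#     for tokens, tags in read_ptbtagged("sample.tagged"):
#         print(tokens, tags)
#     # ['What', "'s", 'next', '?'] ['WP', 'VBZ', 'JJ', '.']
#     # ['Slides', 'to', 'illustrate', 'Shostakovich', 'quartets', '?'] ...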


class Classifier:
def __init__(self):
"""Initializes the classifier."""
        self.token_vectorizer = DictVectorizer(sparse=False)
        self.pos_labeler = preprocessing.LabelEncoder()
        # A LogisticRegression baseline (from sklearn.linear_model) was also
        # tried and can be swapped in unchanged:
        # self.clf = LogisticRegression(n_jobs=-1, random_state=42,
        #                               penalty='elasticnet', solver='saga',
        #                               l1_ratio=0.2)
        # A one-hidden-layer MLP with an adaptive learning rate; max_iter is
        # raised well above the default so training converges.
        self.clf = MLPClassifier(random_state=42, max_iter=1000,
                                 hidden_layer_sizes=(100,),
                                 learning_rate='adaptive')

    def suffix_pos_feature(self, token, suffix):
        """Returns 1 if the lowercased token ends with `suffix`, else 0."""
        return 1 if token.lower().endswith(suffix) else 0

    # Suffixes that are strong cues for particular Penn Treebank tags,
    # e.g. "-ly" for adverbs, "-ing" for gerunds, and "-s" for plurals
    # and third-person singular verbs.
    SUFFIXES = ('ing', 'ion', 'able', 'ance', 'tion', 'sion', 'ment',
                'ure', 'ity', 'age', 'ant', 'ent', 'ive', 'cial', 'tial',
                'ous', 'ic', 'en', 'olve', 'ide', 'ise', 'acy', 'ence',
                'hood', 'ism', 'ful', 'ly', 'ish', 'like', 'ed', 's',
                'ible')

    def get_features(self, token, prev_token, next_token, prev_pos, prev2_pos):
        """Builds the feature dictionary for a single token."""
        features = {
            'token': token,
            'pos-1': prev_pos,
            'first-capital': 1 if token[0].isupper() else 0,
            # Context features that were tried but are currently disabled:
            # 'token-1': prev_token,
            # 'token+1': next_token,
            # 'pos-2': prev2_pos,
        }
        for suffix in self.SUFFIXES:
            features['end-' + suffix] = self.suffix_pos_feature(token, suffix)
        return features
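    # For example, get_features('quartets', 'Shostakovich', '?', 'NNP', 'VB')
    # yields {'token': 'quartets', 'pos-1': 'NNP', 'first-capital': 0,
    # 'end-s': 1, ...} with every other 'end-*' feature set to 0.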

    def train(self, tagged_sentences: Iterator[Tuple[TokenSeq, PosSeq]]) -> Tuple[NDArray, NDArray]:
"""Trains the classifier on the part-of-speech tagged sentences,
and returns the feature matrix and label vector on which it was trained.
The feature matrix should have one row per training token. The number
of columns is up to the implementation, but there must at least be 1
feature for each token, named "token=T", where "T" is the token string,
and one feature for the part-of-speech tag of the preceding token,
named "pos-1=P", where "P" is the part-of-speech tag string, or "<s>" if
the token was the first in the sentence. For example, if the input is:
What WP
's VBZ
next JJ
? .
Then the first row in the feature matrix should have features for
"token=What" and "pos-1=<s>", the second row in the feature matrix
should have features for "token='s" and "pos-1=WP", etc. The alignment
between these feature names and the integer columns of the feature
matrix is given by the `feature_index` method below.
The label vector should have one entry per training token, and each
entry should be an integer. The alignment between part-of-speech tag
strings and the integers in the label vector is given by the
`label_index` method below.
:param tagged_sentences: An iterator over sentences, where each sentence
is a tuple of a sequence of tokens and a corresponding sequence of
part-of-speech tags.
:return: A tuple of (feature-matrix, label-vector).
"""
        features = []
        labels = []
        for tokens, pos_tags in tagged_sentences:
            for i, token in enumerate(tokens):
                features.append(self.get_features(
                    token,
                    tokens[i - 1] if i > 0 else "",
                    tokens[i + 1] if i < len(tokens) - 1 else "",
                    pos_tags[i - 1] if i > 0 else "<s>",
                    pos_tags[i - 2] if i > 1 else "<s>",
                ))
            labels += pos_tags
        # Fit the vectorizer and label encoder on the training data, then
        # train the classifier on the resulting matrix and label vector.
        features = self.token_vectorizer.fit_transform(features)
        labels = self.pos_labeler.fit_transform(labels)
        self.clf.fit(features, labels)
        return features, labels

    def feature_index(self, feature: Text) -> int:
"""Returns the column index corresponding to the given named feature.
The `train` method should always be called before this method is called.
:param feature: The string name of a feature.
:return: The column index of the feature in the feature matrix returned
by the `train` method.
"""
        # DictVectorizer records the name-to-column mapping in `vocabulary_`.
        return self.token_vectorizer.vocabulary_[feature]

    def label_index(self, label: Text) -> int:
        """Returns the integer corresponding to the given part-of-speech tag.
The `train` method should always be called before this method is called.
:param label: The part-of-speech tag string.
:return: The integer for the part-of-speech tag, to be used in the label
vector returned by the `train` method.
"""
        return int(self.pos_labeler.transform([label])[0])
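    # The two index methods tie the outputs of `train` back to names. A
    # sketch, using the docstring's example sentence ('train.tagged' is a
    # hypothetical path):
    #     X, y = clf.train(read_ptbtagged('train.tagged'))
    #     X[0, clf.feature_index('token=What')]  # == 1.0 for the first row
    #     y[0] == clf.label_index('WP')          # first token is tagged WP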

    def predict(self, tokens: TokenSeq) -> PosSeq:
"""Predicts part-of-speech tags for the sequence of tokens.
This method delegates to either `predict_greedy` or `predict_viterbi`.
The implementer may decide which one to delegate to.
:param tokens: A sequence of tokens representing a sentence.
:return: A sequence of part-of-speech tags, one for each token.
"""
_, pos_tags = self.predict_greedy(tokens)
# _, _, pos_tags = self.predict_viterbi(tokens)
return pos_tags

    def predict_greedy(self, tokens: TokenSeq) -> Tuple[NDArray, PosSeq]:
"""Predicts part-of-speech tags for the sequence of tokens using a
greedy algorithm, and returns the feature matrix and predicted tags.
Each part-of-speech tag is predicted one at a time, and each prediction
is considered a hard decision, that is, when predicting the
part-of-speech tag for token i, the model will assume that its
prediction for token i-1 is correct and unchangeable.
The feature matrix should have one row per input token, and be formatted
in the same way as the feature matrix in `train`.
:param tokens: A sequence of tokens representing a sentence.
:return: The feature matrix and the sequence of predicted part-of-speech
tags (one for each input token).
"""
        features = []
        pos_tags = []
        for i, token in enumerate(tokens):
            # Feature the current token using the tags predicted so far;
            # each earlier prediction is treated as fixed (greedy decoding).
            row = self.token_vectorizer.transform(self.get_features(
                token,
                tokens[i - 1] if i > 0 else "",
                tokens[i + 1] if i < len(tokens) - 1 else "",
                pos_tags[i - 1] if i > 0 else "<s>",
                pos_tags[i - 2] if i > 1 else "<s>",
            ))
            features.append(row)
            pos_tags.append(
                self.pos_labeler.inverse_transform(self.clf.predict(row))[0])
        return np.vstack(features), pos_tags
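

if __name__ == '__main__':
    # Minimal usage sketch: 'train.tagged' is an illustrative placeholder
    # path, not a file shipped with this module; the sentence is the
    # docstring's example.
    classifier = Classifier()
    classifier.train(read_ptbtagged('train.tagged'))
    print(classifier.predict(['What', "'s", 'next', '?']))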