from typing import Iterator, Sequence, Text, Tuple, Union
from itertools import groupby
import numpy as np
from scipy.sparse import spmatrix
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier

NDArray = Union[np.ndarray, spmatrix]
TokenSeq = Sequence[Text]
PosSeq = Sequence[Text]


def read_ptbtagged(ptbtagged_path: str) -> Iterator[Tuple[TokenSeq, PosSeq]]:
"""Reads sentences from a Penn TreeBank .tagged file.
Each sentence is a sequence of tokens and part-of-speech tags.
Penn TreeBank .tagged files contain one token per line, with an empty line
marking the end of each sentence. Each line is composed of a token, a tab
character, and a part-of-speech tag. Here is an example:
What WP
's VBZ
next JJ
? .
Slides NNS
to TO
illustrate VB
Shostakovich NNP
quartets NNS
? .
:param ptbtagged_path: The path of a Penn TreeBank .tagged file, formatted
as above.
:return: An iterator over sentences, where each sentence is a tuple of
a sequence of tokens and a corresponding sequence of part-of-speech tags.
"""
    with open(ptbtagged_path, 'r') as f:
        # Blank lines separate sentences; group consecutive non-blank lines
        # into one sentence and split each line into (token, tag).
        for is_blank, lines in groupby(f, key=lambda line: not line.strip()):
            if not is_blank:
                pairs = [line.split() for line in lines]
                tokens = [pair[0] for pair in pairs]
                pos_tags = [pair[1] for pair in pairs]
                yield tokens, pos_tags
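# Usage sketch (assumes a file "sample.tagged" containing the two sentences
# shown in the docstring above; the path is illustrative):
#     for tokens, tags in read_ptbtagged("sample.tagged"):
#         print(tokens, tags)
#     # ['What', "'s", 'next', '?'] ['WP', 'VBZ', 'JJ', '.']
#     # ['Slides', 'to', 'illustrate', 'Shostakovich', 'quartets', '?'] ...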


class Classifier:
def __init__(self):
"""Initializes the classifier."""
        self.token_vectorizer = DictVectorizer(sparse=False)
        self.pos_labeler = preprocessing.LabelEncoder()
        # A LogisticRegression baseline (from sklearn.linear_model) was also
        # tried and can be swapped in unchanged:
        # self.clf = LogisticRegression(n_jobs=-1, random_state=42,
        #                               penalty='elasticnet', solver='saga',
        #                               l1_ratio=0.2)
        # A one-hidden-layer MLP with an adaptive learning rate; max_iter is
        # raised well above the default so training converges.
        self.clf = MLPClassifier(random_state=42, max_iter=1000,
                                 hidden_layer_sizes=(100,),
                                 learning_rate='adaptive')

    def suffix_pos_feature(self, token, suffix):
        """Returns 1 if the lowercased token ends with `suffix`, else 0."""
        return 1 if token.lower().endswith(suffix) else 0

    # Suffixes that are strong cues for particular Penn Treebank tags,
    # e.g. "-ly" for adverbs, "-ing" for gerunds, and "-s" for plurals
    # and third-person singular verbs.
    SUFFIXES = ('ing', 'ion', 'able', 'ance', 'tion', 'sion', 'ment',
                'ure', 'ity', 'age', 'ant', 'ent', 'ive', 'cial', 'tial',
                'ous', 'ic', 'en', 'olve', 'ide', 'ise', 'acy', 'ence',
                'hood', 'ism', 'ful', 'ly', 'ish', 'like', 'ed', 's',
                'ible')

    def get_features(self, token, prev_token, next_token, prev_pos, prev2_pos):
        """Builds the feature dictionary for a single token."""
        features = {
            'token': token,
            'pos-1': prev_pos,
            'first-capital': 1 if token[0].isupper() else 0,
            # Context features that were tried but are currently disabled:
            # 'token-1': prev_token,
            # 'token+1': next_token,
            # 'pos-2': prev2_pos,
        }
        for suffix in self.SUFFIXES:
            features['end-' + suffix] = self.suffix_pos_feature(token, suffix)
        return features
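    # For example, get_features('quartets', 'Shostakovich', '?', 'NNP', 'VB')
    # yields {'token': 'quartets', 'pos-1': 'NNP', 'first-capital': 0,
    # 'end-s': 1, ...} with every other 'end-*' feature set to 0.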

    def train(self, tagged_sentences: Iterator[Tuple[TokenSeq, PosSeq]]) -> Tuple[NDArray, NDArray]:
"""Trains the classifier on the part-of-speech tagged sentences,
and returns the feature matrix and label vector on which it was trained.
The feature matrix should have one row per training token. The number
of columns is up to the implementation, but there must at least be 1
feature for each token, named "token=T", where "T" is the token string,
and one feature for the part-of-speech tag of the preceding token,
named "pos-1=P", where "P" is the part-of-speech tag string, or "<s>" if
the token was the first in the sentence. For example, if the input is:
What WP
's VBZ
next JJ
? .
Then the first row in the feature matrix should have features for
"token=What" and "pos-1=<s>", the second row in the feature matrix
should have features for "token='s" and "pos-1=WP", etc. The alignment
between these feature names and the integer columns of the feature
matrix is given by the `feature_index` method below.
The label vector should have one entry per training token, and each
entry should be an integer. The alignment between part-of-speech tag
strings and the integers in the label vector is given by the
`label_index` method below.
:param tagged_sentences: An iterator over sentences, where each sentence
is a tuple of a sequence of tokens and a corresponding sequence of
part-of-speech tags.
:return: A tuple of (feature-matrix, label-vector).
"""
        features = []
        labels = []
        for tokens, pos_tags in tagged_sentences:
            for i, token in enumerate(tokens):
                features.append(self.get_features(
                    token,
                    tokens[i - 1] if i > 0 else "",
                    tokens[i + 1] if i < len(tokens) - 1 else "",
                    pos_tags[i - 1] if i > 0 else "<s>",
                    pos_tags[i - 2] if i > 1 else "<s>",
                ))
            labels += pos_tags
        # Fit the vectorizer and label encoder on the training data, then
        # train the classifier on the resulting matrix and label vector.
        features = self.token_vectorizer.fit_transform(features)
        labels = self.pos_labeler.fit_transform(labels)
        self.clf.fit(features, labels)
        return features, labels

    def feature_index(self, feature: Text) -> int:
"""Returns the column index corresponding to the given named feature.
The `train` method should always be called before this method is called.
:param feature: The string name of a feature.
:return: The column index of the feature in the feature matrix returned
by the `train` method.
"""
        # DictVectorizer records the name-to-column mapping in `vocabulary_`.
        return self.token_vectorizer.vocabulary_[feature]

    def label_index(self, label: Text) -> int:
        """Returns the integer corresponding to the given part-of-speech tag.
The `train` method should always be called before this method is called.
:param label: The part-of-speech tag string.
:return: The integer for the part-of-speech tag, to be used in the label
vector returned by the `train` method.
"""
        return int(self.pos_labeler.transform([label])[0])
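    # The two index methods tie the outputs of `train` back to names. A
    # sketch, using the docstring's example sentence ('train.tagged' is a
    # hypothetical path):
    #     X, y = clf.train(read_ptbtagged('train.tagged'))
    #     X[0, clf.feature_index('token=What')]  # == 1.0 for the first row
    #     y[0] == clf.label_index('WP')          # first token is tagged WP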

    def predict(self, tokens: TokenSeq) -> PosSeq:
"""Predicts part-of-speech tags for the sequence of tokens.
This method delegates to either `predict_greedy` or `predict_viterbi`.
The implementer may decide which one to delegate to.
:param tokens: A sequence of tokens representing a sentence.
:return: A sequence of part-of-speech tags, one for each token.
"""
_, pos_tags = self.predict_greedy(tokens)
# _, _, pos_tags = self.predict_viterbi(tokens)
return pos_tags

    def predict_greedy(self, tokens: TokenSeq) -> Tuple[NDArray, PosSeq]:
"""Predicts part-of-speech tags for the sequence of tokens using a
greedy algorithm, and returns the feature matrix and predicted tags.
Each part-of-speech tag is predicted one at a time, and each prediction
is considered a hard decision, that is, when predicting the
part-of-speech tag for token i, the model will assume that its
prediction for token i-1 is correct and unchangeable.
The feature matrix should have one row per input token, and be formatted
in the same way as the feature matrix in `train`.
:param tokens: A sequence of tokens representing a sentence.
:return: The feature matrix and the sequence of predicted part-of-speech
tags (one for each input token).
"""
        features = []
        pos_tags = []
        for i, token in enumerate(tokens):
            # Feature the current token using the tags predicted so far;
            # each earlier prediction is treated as fixed (greedy decoding).
            row = self.token_vectorizer.transform(self.get_features(
                token,
                tokens[i - 1] if i > 0 else "",
                tokens[i + 1] if i < len(tokens) - 1 else "",
                pos_tags[i - 1] if i > 0 else "<s>",
                pos_tags[i - 2] if i > 1 else "<s>",
            ))
            features.append(row)
            pos_tags.append(
                self.pos_labeler.inverse_transform(self.clf.predict(row))[0])
        return np.vstack(features), pos_tags
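

if __name__ == '__main__':
    # Minimal usage sketch: 'train.tagged' is an illustrative placeholder
    # path, not a file shipped with this module; the sentence is the
    # docstring's example.
    classifier = Classifier()
    classifier.train(read_ptbtagged('train.tagged'))
    print(classifier.predict(['What', "'s", 'next', '?']))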