Statistical Natural Language Processing

📚 CSC 539 3 Credits 🎯 academic

    This course is taught by Peter Jansen.

    An advanced introduction to Statistical Natural Language Processing (NLP), including information theory, part-of-speech tagging, parsing, machine translation, machine learning, and information retrieval. Emphasizes implementing foundational algorithms and representations for common NLP tasks from the ground up.

    The course included a number of programming assignments that exercised different applications of NLP in Python, implementing each from the ground up. Some of these assignments, with my implementations, are given below:

    POS tagging and Named Entity Recognition

    from typing import List, Set, Dict, Tuple, Optional, Text
    from operator import itemgetter
    
    # import spacy package.
    import spacy
    
    
    # Please fill in the following variables.
    Name="Md Rahat-uz- Zaman"
    Email="rahatzamancse@email.arizona.edu"
    Collaborator=""
    
    # load  the small english core web model from spacy.
    nlp = spacy.load("en_core_web_sm")
    
    
    def tokenize_text(text: Text) -> List[Text]:
        """This functions tokenize a text by iterating 
                over its tokens by using spacy English tokenizer.
            =============
            Params:
                text
            Return:
                A list of tokenized items. ["token1", "token2", ...etc]
        """
        return list(map(lambda e: e.text, nlp.tokenizer(text)))
        
        
    def recognize_name_entity(text: Text) -> List[Text]:
        """This function recognizes name entities in a text by using Spacy
            English tokenizer.
            #NOTE: Please read about converting span (span1, span2..etc)
                    into list of strings ["span1", "span2"...etc].
            =============
            Params:
                text 
            Return:
                A list of named entity strings. ex: ['entity1', 'entity2', ...etc]
        """ 
        return list(map(lambda e: e.text, nlp(text).ents))
     
    
    def get_entity_labels(text: Text) -> List[Tuple[Text, Text]]:
        """This function obtains the labels of entities.
            #NOTE: Please read about the types of label methods in spacy. You should 
                return a string label rather than an integer label.
            =============
            Params:
                text 
            :Return:
            A list of tuples of entities and their labels.
                ex: [("entity1", "label1"), ("entity2", "label2")...etc]
        """
        return list(map(lambda e: (e.text, e.label_), nlp(text).ents))
    
        
    def get_lemmas(text: Text) -> List[Tuple[Text, Text]]:
        """This function finds  lemmas in a text by using Spacy 
            English tokenizer. It must return a token and its lemma.
            #NOTE: Please o read about the types of lemma methods in spacy. Your should  
                    return a string lemma rahter than an integer lemma. 
            =============
            :Params:
                text 
            :Return:
            A list of tuples of tokens and their lemmas.
                ex: [('tokens', 'token'), ...etc] 
        """
        return list(map(lambda e: (e.text, e.lemma_), nlp(text)))
       
    
    def get_POS_tags(text: Text) ->  List[Tuple[Text, ...]]:
        """This function obtains  with its the associated POS and tags of each token 
            in a text by using Spacy tags.  It must return the verb itself, 
            part of of speech (POS), and the associated tag. 
            #NOTE: Read Spacy POS and assciated tags. 
        =============
            Params:
                text
            Return:
                A list of tuples of strings. ex: [('get', 'VERB', 'VB'),...etc] 
        """
        return list(map(lambda e: (e.text, e.pos_, e.tag_), nlp(text)))
      
    
    def pos_frequency(text: Text) ->  List[Tuple[Text, int]]:
        """This function returns frequency counts of part of speech (POS) 
            in a text. It must return the POS and its frequency. 
            #NOTE: Refer to counting in spaCy: count the frequencies of the given attribute, make a list from the dictionary of POS
                   counts, then sort the list. Also, make sure to sort your output by the key.
            =============
            Params:
                text 
            Return:
                A sorted list  of tuples of strings and integers sorted by the key. ex: [('ADV', 2),...etc]
        """
        parsed = nlp(text)
        # Map each POS attribute id back to its string label and pair it with its count.
        return sorted((parsed.vocab[attr].text, count) for attr, count in parsed.count_by(spacy.attrs.POS).items())
    
        
    def parse_dependency(text: Text) -> List[Tuple[Text, ...]]:
        """This function parse a single sentence.
        =============
            Params:
                text 
            Return:
                A list of tuples of strings. Ex: [('is', 'ROOT'), ...etc]
        """
        return list(map(lambda e: (e.text, e.dep_), nlp(text)))
    
    def count_dependency(text: Text) -> List[Tuple[Text, int]]:
        """This function extracts the dependencies of sentences  and thier frequenciesin a text. 
            It must return a parsed dependency and its frequency.
            #NOTE: Refer to counting in spaCy: count the frequencies of the given attribute, make a list from the dictionary of DEP
                   counts, then sort the list. Also, make sure to sort your output by the key.
            ============
            Params:
                text 
            Return:
                A sorted list of tuples of strings and integers sorted by the key. Ex: [('ROOT', 1),...etc]
        """
        parsed = nlp(text)
        # Map each dependency attribute id back to its string label and pair it with its count.
        return sorted((parsed.vocab[attr].text, count) for attr, count in parsed.count_by(spacy.attrs.DEP).items())
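
    A short driver below shows how these functions can be exercised; the sample sentence is illustrative only and assumes the en_core_web_sm model is installed.

    if __name__ == "__main__":
        sample = "Apple is looking at buying a U.K. startup for $1 billion."
        print(tokenize_text(sample))          # ['Apple', 'is', 'looking', ...]
        print(recognize_name_entity(sample))  # e.g. ['Apple', 'U.K.', '$1 billion']
        print(get_entity_labels(sample))      # e.g. [('Apple', 'ORG'), ...]
        print(get_POS_tags(sample)[:2])       # e.g. [('Apple', 'PROPN', 'NNP'), ('is', 'AUX', 'VBZ')]
        print(pos_frequency(sample))          # POS counts sorted by label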
    

    Bootstrap Resampling

    # BootstrapResampling.py
    
    import random
    
    
    class BootstrapResampling():
        """ This class  implements the non-parametric bootstrap resampling procedure discussed in class.
        """
    
        def getAverageBaselineScore(self, dataIn:list):
            """Given a list of dictionaries (dataIn) with key
                'baselineScore' (float), calculate the average baselineScore
                Example: [ {'question':"Question Text", 'answer':"Answer Text",
                'baselineScore':0.0, 'experimentalScore':1.0}, ... ]
    
                :param dataIn: List of dictionaries with key 'baselineScore'
                :return: Average 'baselineScore' across all elements in list.
            """
            return sum(map(lambda d: d['baselineScore'], dataIn))/len(dataIn)
    
        def getAverageExperimentalScore(self, dataIn:list):
            """Given a list of dictionaries (dataIn) with key
            'experimentalScore' (float), calculate the average experimentalScore
            Example: [ {'question':"Question Text", 'answer':"Answer Text",
            'baselineScore':0.0, 'experimentalScore':1.0}, ... ]
    
                :param dataIn: List of dictionaries with key 'experimentalScore'
                :return: Average 'experimentalScore' across all elements in list.
            """
            return sum(map(lambda d: d['experimentalScore'], dataIn))/len(dataIn)
    
        def createDifferenceScores(self, dataIn:list):
            """Given a list of dictionaries (dataIn) with keys 'baselineScore'
                and 'experimentalScore', calculate their difference scores
                (experimentalScore - baselineScore).
                Example: [ {'question':"Question Text", 'answer':"Answer Text",
            'baselineScore':0.0, 'experimentalScore':1.0}, ... ]
                Example output: [1.0, ...]
    
                :param dataIn: List of dictionaries with float keys 'baselineScore', 'experimentalScore'
                :return: List of floats representing difference scores (experimental - baseline)
            """
            return [d['experimentalScore'] - d['baselineScore'] for d in dataIn]
    
        def generateOneResample(self, differenceScores:list):
            """Randomly resamples the difference scores, to make a bootstrapped resample
                Example input: [0, 1, 0, 0, 1, 0, 1, 1, 0]
                Example output: [1, 0, 1, 0, 0, 1, 0, 1, 1]
    
                :param differenceScores: A list of difference scores (floats).
                :return: A list of randomly resampled difference scores (floats),
                    of the same length as the input, populated using random
                    sampling with replacement.
            """
            return random.choices(differenceScores, k=len(differenceScores))
    
        def calculatePValue(self, dataIn:list, numResamples=10000):
            """Calculate the p-value of a dataset using the bootstrap resampling procedure.
                Example: [ {'question':"Question Text", 'answer':"Answer Text",
                'baselineScore':0.0, 'experimentalScore':1.0}, ... ]
                Example output: 0.01
    
                :param dataIn: List of dictionaries with float keys 'baselineScore', 'experimentalScore' populated
                :param numResamples: The number of bootstrap resamples to use (typically 10,000 or higher)
                :return: A value representing the p-value using the bootstrap resampling procedure (float)
            """
            return sum([sum(self.generateOneResample(self.createDifferenceScores(dataIn))) <= 0 for _ in range(numResamples)])/numResamples
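
    A quick usage sketch follows; the toy data below is hypothetical and only meant to show how the class is called.

    if __name__ == "__main__":
        data = [
            {'question': 'Q1', 'answer': 'A1', 'baselineScore': 0.0, 'experimentalScore': 1.0},
            {'question': 'Q2', 'answer': 'A2', 'baselineScore': 1.0, 'experimentalScore': 1.0},
            {'question': 'Q3', 'answer': 'A3', 'baselineScore': 0.0, 'experimentalScore': 0.0},
        ]
        br = BootstrapResampling()
        print(br.getAverageBaselineScore(data))      # 0.333...
        print(br.getAverageExperimentalScore(data))  # 0.666...
        print(br.calculatePValue(data))              # p-value from 10,000 resamples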
    

    Spam/NoSpam classification

    import re
    from typing import Iterator, Iterable, Tuple, Text, Union
    
    import numpy as np
    from scipy.sparse import spmatrix
    
    from sklearn.preprocessing import LabelEncoder
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.linear_model import LogisticRegression
    
    NDArray = Union[np.ndarray, spmatrix]
    
    
    def read_smsspam(smsspam_path: str) -> Iterator[Tuple[Text, Text]]:
        """Generates (label, text) tuples from the lines in an SMSSpam file.
    
        SMSSpam files contain one message per line. Each line is composed of a label
        (ham or spam), a tab character, and the text of the SMS. Here are some
        examples:
    
          spam	85233 FREE>Ringtone!Reply REAL
          ham	I can take you at like noon
          ham	Where is it. Is there any opening for mca.
    
        :param smsspam_path: The path of an SMSSpam file, formatted as above.
        :return: An iterator over (label, text) tuples.
        """
        with open(smsspam_path, 'r') as file:
            return [(line.split('\t')[0], '\t'.join(line.split('\t')[1:])) for line in file.readlines()]
    
    
    class TextToFeatures:
        def __init__(self, texts: Iterable[Text]):
            """Initializes an object for converting texts to features.
    
            During initialization, the provided training texts are analyzed to
            determine the vocabulary, i.e., all feature values that the converter
            will support. Each such feature value will be associated with a unique
            integer index that may later be accessed via the .index() method.
    
            It is up to the implementer exactly what features to produce from a
            text, but the features will always include some single words and some
            multi-word expressions (e.g., "need" and "to you").
    
            :param texts: The training texts.
            """
            # Use sklearn's built-in English stop word list (selected by the string
            # "english"), unigram and bigram features, and alphabetic tokens only.
            self.model = CountVectorizer(ngram_range=(1,2), stop_words="english", token_pattern=r'\b[a-zA-Z]+\b').fit(texts)
    
    
        def index(self, feature: Text):
            """Returns the index in the vocabulary of the given feature value.
    
            :param feature: A feature
            :return: The unique integer index associated with the feature.
            """
            return np.where(self.model.get_feature_names_out() == feature)[0][0]
    
    
        def __call__(self, texts: Iterable[Text]) -> NDArray:
            """Creates a feature matrix from a sequence of texts.
    
            Each row of the matrix corresponds to one of the input texts. The value
            at index j of row i is the value in the ith text of the feature
            associated with the unique integer j.
    
            It is up to the implementer what the value of a feature that is present
            in a text should be, though a common choice is 1. Features that are
            absent from a text will have the value 0.
    
            :param texts: A sequence of texts.
            :return: A matrix, with one row of feature values for each text.
            """
            vectorized = self.model.transform(texts).toarray()
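            # Append one extra binary feature per text: whether it contains a run of
            # two or more digits (phone numbers and shortcodes are common in spam).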
            has_numbers = [
                [1 if re.search(re.compile(r'\d\d+'), text) else 0]
            for text in texts]
            features = np.concatenate((vectorized, has_numbers), axis=1)
            return features
    
    
    
    class TextToLabels:
        def __init__(self, labels: Iterable[Text]):
            """Initializes an object for converting texts to labels.
    
            During initialization, the provided training labels are analyzed to
            determine the vocabulary, i.e., all labels that the converter will
            support. Each such label will be associated with a unique integer index
            that may later be accessed via the .index() method.
    
            :param labels: The training labels.
            """
            self.model = LabelEncoder().fit(labels)
    
        def index(self, label: Text) -> int:
            """Returns the index in the vocabulary of the given label.
    
            :param label: A label
            :return: The unique integer index associated with the label.
            """
            return np.where(self.model.classes_ == label)[0][0]
    
        def __call__(self, labels: Iterable[Text]) -> NDArray:
            """Creates a label vector from a sequence of labels.
    
            Each entry in the vector corresponds to one of the input labels. The
            value at index j is the unique integer associated with the jth label.
    
            :param labels: A sequence of labels.
            :return: A vector, with one entry for each label.
            """
            return self.model.transform(labels)
    
    
    class Classifier:
        def __init__(self):
            """Initalizes a logistic regression classifier.
            """
            self.clf = LogisticRegression(
                random_state=0,
                solver="saga",
                penalty="elasticnet",
                l1_ratio=0.2
            )
    
        def train(self, features: NDArray, labels: NDArray) -> None:
            """Trains the classifier using the given training examples.
    
            :param features: A feature matrix, where each row represents a text.
            Such matrices will typically be generated via TextToFeatures.
            :param labels: A label vector, where each entry represents a label.
            Such vectors will typically be generated via TextToLabels.
            """
            self.clf.fit(features, labels)
    
        def predict(self, features: NDArray) -> NDArray:
            """Makes predictions for each of the given examples.
    
            :param features: A feature matrix, where each row represents a text.
            Such matrices will typically be generated via TextToFeatures.
            :return: A prediction vector, where each entry represents a label.
            """
            return self.clf.predict(features)
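
    A rough end-to-end sketch is below; the data file name is hypothetical and stands in for any file in the SMSSpam format described in read_smsspam.

    if __name__ == "__main__":
        labels, texts = zip(*read_smsspam("SMSSpamCollection.txt"))
        to_features = TextToFeatures(texts)
        to_labels = TextToLabels(labels)
        clf = Classifier()
        clf.train(to_features(texts), to_labels(labels))
        predicted = clf.predict(to_features(texts[:5]))
        # Map the integer predictions back to their string labels.
        print(list(to_labels.model.inverse_transform(predicted)))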
    

    Dense and Sparse Vector and Document similarity

    import math
    import re
    from collections import defaultdict
    
    
    class VecDense:
    
        def tokenizeDoc(self, oneDoc: str):
            """This method tokenizes a a string.
    
            :param oneDoc: a string.
            :return: a list of lowercase tokens.
            """
            sanitizedStr = re.sub(r'[^a-zA-Z0-9 ]', '', oneDoc)
            tokens = sanitizedStr.lower().split(" ")
            return tokens
    
        def getVecLength(self, vecIn: list):
            """This method computes the length of a vector.
    
            :param vecIn: a list representing a vector, one element per dimension.
            :return: the length of the vector.
            """
            return math.sqrt(sum([i*i for i in vecIn]))
    
        def normalizeVec(self, vecIn:list):
            """This method normalizes a vector to unit length.
    
            :param vecIn:  a list representing a vector, one element per dimension.
            :return: a list representing a vector, that has been normalized to unit length.
            """
            return [i/self.getVecLength(vecIn) for i in vecIn]
    
        def dotProductVec(self, vecInA:list, vecInB:list):
            """This method takes the dot product of two vectors.
    
            :param vecInA, vecInB: two lists representing vectors,
                one element per dimension.
            :return: the dot product.
            """
            return sum([i*j for i,j in zip(vecInA, vecInB)])
    
        def cosine(self, vecInA: list, vecInB: list):
            """This method obtains the cosine between two vectors
                (which is nominally the dot product of two vectors of unit length).
    
            :param vecInA, vecInB: two lists representing vectors, one element per dimension.
            :return: the cosine.
            """
            return self.dotProductVec(vecInA, vecInB)/(self.getVecLength(vecInA)*self.getVecLength(vecInB))
    
        def computeCentroidVector(self, tokensIn:list, vecDict:dict):
            """This method calculates the centroid vector from a list of
                tokens. The centroid vector is the "average"
                vector of a list of tokens.
            #NOTE:  Special considerations:
                - all tokens should be converted to lower case.
                - if a vector isn't in the dictionary, it
                    shouldn't be a part of the average.
    
            :param tokensIn: a list of tokens.
            :param vecDict: the vector library is a dictionary, 'vecDict',
                whose keys are tokens, and values are lists representing vectors.
            :return: the centroid vector, represented as a list.
            """
            vectors = [vecDict[token] for token in tokensIn if token in vecDict]
    
            total = [0 for _ in range(len(vectors[0]))]
            for vector in vectors:
                for i, el in enumerate(vector):
                    total[i] += el
    
            return list(map(lambda d: d/len(vectors), total))
    
    class VecSparseTFIDF:
        """This class calculates TF-IDF vector
        """
        def tokenizeDoc(self, oneDoc: str):
            """This method tokenizes a text.
    
            :param oneDoc: a string of text.
            :return: a tokenized text.
            """
            sanitizedStr = re.sub(r'[^a-zA-Z0-9 ]', '', oneDoc)
            tokens = sanitizedStr.lower().split(" ")
            return tokens
    
        def getTermFreq(self, oneDoc: str):
            """This method obtains term frequency.
    
            :param oneDoc: a string of text, called a document. e.g. "the cat saw the hat"
            :return: a default dictionary representing the term frequency
                counts in that document. Keys are tokens,
                values are counts.
            #NOTE: The input document should be tokenized using tokenizeDoc method.
            """
            termFreq = defaultdict(lambda: 0)
            for t in self.tokenizeDoc(oneDoc):
                termFreq[t] += 1
            return termFreq
    
        def getDocFreqs(self, allDocs: list):
            """This method obtains document frequencies.
    
            :param allDocs: a list of strings.  Each string is one document.
            :return: a default dictionary representing the document frequency
                counts across all documents. Keys are tokens,
                values are counts.
            """
            docFreq = defaultdict(lambda: 0)
    
            tokenSet = set()
            for document in allDocs:
                tokenSet |= set(self.tokenizeDoc(document))
    
            for token in tokenSet:
                for document in allDocs:
                    if token in self.tokenizeDoc(document):
                        docFreq[token] += 1
            return docFreq
    
        def makeTFIDFVec(self, oneDoc: str, docFreqs: defaultdict, numDocs: int):
            """This method creates a TF-IDF vector for a given document.
    
            :param oneDoc: a string representing one document.
            :param docFreqs: a default dictionary representing the document
                frequency counts.  Keys are tokens, values are counts.
            :param numDocs: the total number of documents in the collection.
            :return: a default dictionary representing the tf-idf vector.
                Keys are tokens, values are counts.
            #NOTE: There are many ways to calculate tf-idf vectors.
               Term frequency should be the count of the words
                in a given document.
           Document frequency should be calculated with add-one
            smoothing, as log10((numDocs + 1) / (docFreqOfToken + 1)).
            """
            vecOut = defaultdict(lambda: 0)
            for token, tf_i in self.getTermFreq(oneDoc).items():
                vecOut[token] = tf_i*(
                    # math.log10(numDocs+1 / docFreqs[token] + 1)
                    math.log10((numDocs+1)/(docFreqs[token]+1))
                )
    
            return vecOut
    
        def getVecLengthSparse(self, vecIn: defaultdict):
            """This method computes the length of a sparse vector.
    
            :param vecIn: a default dictionary representing a sparse
                vector. keys are tokens, values are counts, default = 0.
            :return:the length of the vector.
            """
            return math.sqrt(sum([val*val for val in vecIn.values()]))
    
        def normalizeVecSparse(self, vecIn: defaultdict):
            """This method normalizes a sparse vector to unit length.
    
            :param vecIn: a default dictionary representing a sparse vector.
              keys are tokens, values are counts, default = 0.
            :return: a list representing a vector, that has been
                normalized to unit length.
            """
            vecOut = defaultdict(lambda: 0)
            for token,val in vecIn.items():
                vecOut[token] = val/self.getVecLengthSparse(vecIn)
            return vecOut
    
    
        def dotProductVecSparse(self, vecInA: defaultdict, vecInB: defaultdict):
            """This method takes the dot product of two sparse vectors.
    
            :param vecInA, vecInB:two default dictionaries representing sparse
                vectors.  keys are tokens, values are counts, default = 0.
            :return: the dot product.
            """
            return sum([vecInA[token]*vecInB[token] for token in set(list(vecInA.keys()) + list(vecInB.keys()))])
    
        def cosineSparse(self, vecInA: defaultdict, vecInB: defaultdict):
            """This method obtains the cosine between two vectors (which
                is nominally the dot product of two vectors of unit length).
    
            :param vecInA, vecInB: two default dictionaries representing sparse
                vectors.  keys are tokens, values are counts, default = 0.
            :return: the cosine.
            """
            return self.dotProductVecSparse(vecInA, vecInB)/(self.getVecLengthSparse(vecInA)*self.getVecLengthSparse(vecInB))
            
    
    
    def loadVectors(filename:str):
        """This function loads word vectors from the file.
    
        :param filename:  the filename of the vectors
            (e.g. glove.subset.50d.txt)
        :return: a dictionary, key: token, value: list of numbers loaded
            from the file.
        """
        wordVecs = {}
        print("Loading word vectors from file (" + str(filename) + ")")
        
        with open(filename, "r") as file:
            lines = file.readlines()
            for line in lines:
                wordVecs[line.split()[0]] = list(map(float, line.split()[1:]))
    
        print("Loaded " + str(len(wordVecs)) + " word vectors.")
        return wordVecs
    
    
    def doQuestionAnsweringCentroidDense(questions:list, wordVecs:dict):
        """This function performs a baseline question answering task.
        This function implements a cosine similarity baseline question
        answering system for multiple choice questions. For each question,
        compute the centroid vector of the question text.
        Then, for each answer candidate, compute the cosine between the
        question text centroid vector, and that answer choice's centroid
        vector. Pick the answer choice that has the highest cosine
        similarity as the model's chosen answer. If the answer is correct,
         increment numCorrect. Return the total numCorrect.
    
        :param questions: a list of multiple choice questions, as included in the
            appropriate test.
        :param wordVecs: a dictionary of word vectors, loaded from
            'loadVectors()'.
        :return: the number of questions answered correctly.
        """
        vecDense = VecDense()
    
        # Evaluate QA performance of cosine model on questions
        numCorrect = 0
        for question in questions:
            best_choice = (-1, -1)
            question_centroid = vecDense.computeCentroidVector(vecDense.tokenizeDoc(question["question"]), wordVecs)
            for i, answer in enumerate(question["choices"]):
                answer_centroid = vecDense.computeCentroidVector(vecDense.tokenizeDoc(answer), wordVecs)
                cosine = vecDense.cosine(question_centroid, answer_centroid)
                if cosine > best_choice[1]:
                    best_choice = (i, cosine)
            if best_choice[0] == question["correctIdx"]:
                numCorrect += 1
    
        return numCorrect
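
    A small demonstration of the sparse TF-IDF path follows; the toy documents are made up and only illustrate how the methods fit together (the dense question-answering path additionally needs the glove.subset.50d.txt vectors).

    if __name__ == "__main__":
        docs = ["the cat saw the hat", "the dog saw the cat", "a hat for a cat"]
        tfidf = VecSparseTFIDF()
        docFreqs = tfidf.getDocFreqs(docs)
        vecA = tfidf.makeTFIDFVec(docs[0], docFreqs, len(docs))
        vecB = tfidf.makeTFIDFVec(docs[1], docFreqs, len(docs))
        # Cosine similarity between the two TF-IDF vectors (a float in [0, 1] here).
        print(tfidf.cosineSparse(vecA, vecB))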
    

    Maximum Entropy Markov Model

    from typing import Iterator, Sequence, Text, Tuple, Union
    
    from itertools import groupby
    
    import numpy as np
    from scipy.sparse import spmatrix
    
    from sklearn.feature_extraction import DictVectorizer
    from sklearn import preprocessing
    from sklearn.neural_network import MLPClassifier
    
    NDArray = Union[np.ndarray, spmatrix]
    TokenSeq = Sequence[Text]
    PosSeq = Sequence[Text]
    
    def read_ptbtagged(ptbtagged_path: str) -> Iterator[Tuple[TokenSeq, PosSeq]]:
        """Reads sentences from a Penn TreeBank .tagged file.
        Each sentence is a sequence of tokens and part-of-speech tags.
    
        Penn TreeBank .tagged files contain one token per line, with an empty line
        marking the end of each sentence. Each line is composed of a token, a tab
        character, and a part-of-speech tag. Here is an example:
    
            What	WP
            's	VBZ
            next	JJ
            ?	.
    
            Slides	NNS
            to	TO
            illustrate	VB
            Shostakovich	NNP
            quartets	NNS
            ?	.
    
        :param ptbtagged_path: The path of a Penn TreeBank .tagged file, formatted
        as above.
        :return: An iterator over sentences, where each sentence is a tuple of
        a sequence of tokens and a corresponding sequence of part-of-speech tags.
        """
        sentences = []
        with open(ptbtagged_path, 'r') as f:
            # Group consecutive non-blank lines into sentences; blank lines separate sentences.
            for is_blank, group in groupby(f.readlines(), key=lambda line: line.strip() == ""):
                if is_blank:
                    continue
                pairs = [line.strip().split() for line in group]
                tokens = [pair[0] for pair in pairs]
                tags = [pair[1] for pair in pairs]
                sentences.append((tokens, tags))
        return sentences
    
    
    
    
    class Classifier(object):
        def __init__(self):
            """Initializes the classifier."""
            self.token_vectorizer = DictVectorizer(sparse=False)
            self.pos_labeler = preprocessing.LabelEncoder()
            # self.clf = LogisticRegression(n_jobs=-1, random_state=42, penalty='elasticnet', solver='saga', l1_ratio=0.2)
            # self.clf = LogisticRegression()
            self.clf = MLPClassifier(random_state=42, max_iter=1000, hidden_layer_sizes=(100,), learning_rate='adaptive')
            # self.stemmer = nltk.stem.SnowballStemmer('english')
    
        def suffix_pos_feature(self, token, suffix):
            return 1 if token.lower().endswith(suffix) else 0
    
        def get_features(self, token, prev_token, next_token, prev_pos, prev2_pos):
            return {
                # 'token': self.stemmer.stem(token),
                'token': token,
                # 'token-1': prev_token,
                # 'token+1': next_token,
                'pos-1': prev_pos,
                # 'pos-2': prev2_pos,
                'first-capital': 1 if token[0].isupper() else 0,
                'end-ing': self.suffix_pos_feature(token, 'ing'),
                'end-ion': self.suffix_pos_feature(token, 'ion'),
                'end-able': self.suffix_pos_feature(token, 'able'),
                'end-ance': self.suffix_pos_feature(token, 'ance'),
                'end-tion': self.suffix_pos_feature(token, 'tion'),
                'end-sion': self.suffix_pos_feature(token, 'sion'),
                'end-ment': self.suffix_pos_feature(token, 'ment'),
                'end-ure': self.suffix_pos_feature(token, 'ure'),
                'end-ity': self.suffix_pos_feature(token, 'ity'),
                'end-age': self.suffix_pos_feature(token, 'age'),
                'end-ant': self.suffix_pos_feature(token, 'ant'),
                'end-ent': self.suffix_pos_feature(token, 'ent'),
                'end-ive': self.suffix_pos_feature(token, 'ive'),
                'end-cial': self.suffix_pos_feature(token, 'cial'),
                'end-tial': self.suffix_pos_feature(token, 'tial'),
                'end-ous': self.suffix_pos_feature(token, 'ous'),
                'end-ic': self.suffix_pos_feature(token, 'ic'),
                'end-en': self.suffix_pos_feature(token, 'en'),
                'end-olve': self.suffix_pos_feature(token, 'olve'),
                'end-ide': self.suffix_pos_feature(token, 'ide'),
                'end-ise': self.suffix_pos_feature(token, 'ise'),
                'end-acy': self.suffix_pos_feature(token, 'acy'),
                'end-ence': self.suffix_pos_feature(token, 'ence'),
                'end-hood': self.suffix_pos_feature(token, 'hood'),
                'end-ism': self.suffix_pos_feature(token, 'ism'),
                'end-ful': self.suffix_pos_feature(token, 'ful'),
                'end-ly': self.suffix_pos_feature(token, 'ly'),
                'end-ish': self.suffix_pos_feature(token, 'ish'),
                'end-like': self.suffix_pos_feature(token, 'like'),
                'end-ed': self.suffix_pos_feature(token, 'ed'),
                'end-s': self.suffix_pos_feature(token, 's'),
                'end-ible': self.suffix_pos_feature(token, 'ible'),
            }
    
        def train(self, tagged_sentences: Iterator[Tuple[TokenSeq, PosSeq]]) -> Tuple[NDArray, NDArray]:
            """Trains the classifier on the part-of-speech tagged sentences,
            and returns the feature matrix and label vector on which it was trained.
    
            The feature matrix should have one row per training token. The number
            of columns is up to the implementation, but there must at least be 1
            feature for each token, named "token=T", where "T" is the token string,
            and one feature for the part-of-speech tag of the preceding token,
            named "pos-1=P", where "P" is the part-of-speech tag string, or "<s>" if
            the token was the first in the sentence. For example, if the input is:
    
                What	WP
                's	VBZ
                next	JJ
                ?	.
    
            Then the first row in the feature matrix should have features for
            "token=What" and "pos-1=<s>", the second row in the feature matrix
            should have features for "token='s" and "pos-1=WP", etc. The alignment
            between these feature names and the integer columns of the feature
            matrix is given by the `feature_index` method below.
    
            The label vector should have one entry per training token, and each
            entry should be an integer. The alignment between part-of-speech tag
            strings and the integers in the label vector is given by the
            `label_index` method below.
    
            :param tagged_sentences: An iterator over sentences, where each sentence
            is a tuple of a sequence of tokens and a corresponding sequence of
            part-of-speech tags.
            :return: A tuple of (feature-matrix, label-vector).
            """
            features = []
            labels = []
            for tokens, poses in tagged_sentences:
                features += [self.get_features(token, tokens[i-1] if i > 0 else "", tokens[i+1] if i < len(tokens)-1 else "", poses[i-1] if i > 0 else "<s>", poses[i-2] if i > 1 else "<s>") for i, token in enumerate(tokens)]
                labels += poses
            
            features, labels = self.token_vectorizer.fit_transform(features), self.pos_labeler.fit_transform(labels)
            self.clf.fit(
                features,
                labels
            )
    
            return features, labels
            
    
    
        def feature_index(self, feature: Text) -> int:
            """Returns the column index corresponding to the given named feature.
    
            The `train` method should always be called before this method is called.
    
            :param feature: The string name of a feature.
            :return: The column index of the feature in the feature matrix returned
            by the `train` method.
            """
            return np.where(self.token_vectorizer.get_feature_names_out() == feature)[0].squeeze()
    
        def label_index(self, label: Text) -> int:
            """Returns the integer corresponding to the given part-of-speech tag
    
            The `train` method should always be called before this method is called.
    
            :param label: The part-of-speech tag string.
            :return: The integer for the part-of-speech tag, to be used in the label
            vector returned by the `train` method.
            """
            return self.pos_labeler.transform([label])[0]
    
        def predict(self, tokens: TokenSeq) -> PosSeq:
            """Predicts part-of-speech tags for the sequence of tokens.
    
            This method delegates to either `predict_greedy` or `predict_viterbi`.
            The implementer may decide which one to delegate to.
    
            :param tokens: A sequence of tokens representing a sentence.
            :return: A sequence of part-of-speech tags, one for each token.
            """
            _, pos_tags = self.predict_greedy(tokens)
            # _, _, pos_tags = self.predict_viterbi(tokens)
            return pos_tags
    
        def predict_greedy(self, tokens: TokenSeq) -> Tuple[NDArray, PosSeq]:
            """Predicts part-of-speech tags for the sequence of tokens using a
            greedy algorithm, and returns the feature matrix and predicted tags.
    
            Each part-of-speech tag is predicted one at a time, and each prediction
            is considered a hard decision, that is, when predicting the
            part-of-speech tag for token i, the model will assume that its
            prediction for token i-1 is correct and unchangeable.
    
            The feature matrix should have one row per input token, and be formatted
            in the same way as the feature matrix in `train`.
    
            :param tokens: A sequence of tokens representing a sentence.
            :return: The feature matrix and the sequence of predicted part-of-speech
            tags (one for each input token).
            """
            features = []
            poses = []
            for i, token in enumerate(tokens):
                features.append(self.token_vectorizer.transform(self.get_features(token, tokens[i-1] if i > 0 else "", tokens[i+1] if i < len(tokens)-1 else "", poses[i-1] if i > 0 else "<s>", poses[i-2] if i > 1 else "<s>")))
                poses.append(self.pos_labeler.inverse_transform(self.clf.predict(features[-1]))[0])
    
            return np.array(features).squeeze(), np.array(poses).squeeze()
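
    A brief training/prediction sketch is below; the .tagged file name is hypothetical and stands in for any Penn TreeBank-style file in the format read_ptbtagged expects.

    if __name__ == "__main__":
        tagger = Classifier()
        # Train on (token sequence, tag sequence) pairs read from a PTB .tagged file.
        tagger.train(read_ptbtagged("train.tagged"))
        print(tagger.predict(["What", "'s", "next", "?"]))  # e.g. ['WP', 'VBZ', 'JJ', '.']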
    