Lowercasing & noise removal

In [1]:
import re
In [2]:
def cleanup(txt):
    txt = re.sub(r'@\w+', ' ', txt)              # remove @mentions
    txt = re.sub(r'\w+://\S+', ' ', txt)         # remove URLs (scheme://...)
    txt = re.sub(r'[^0-9a-zA-Z \t]', ' ', txt)   # replace punctuation with spaces
    return txt.strip().lower()
In [3]:
txt = '@Craig_Spur @kev_g1 @davspurs We have put a number now on every single head within the Covid world which makes people in the public feel that every loss is personal to them and I do get it but you think people are gonna go out an exercise when we tell them 17m die globally a year through crap lifestyles?'
print(txt)
@Craig_Spur @kev_g1 @davspurs We have put a number now on every single head within the Covid world which makes people in the public feel that every loss is personal to them and I do get it but you think people are gonna go out an exercise when we tell them 17m die globally a year through crap lifestyles?
In [4]:
print(cleanup(txt))
we have put a number now on every single head within the covid world which makes people in the public feel that every loss is personal to them and i do get it but you think people are gonna go out an exercise when we tell them 17m die globally a year through crap lifestyles
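
The URL rule was not exercised above, so here is a quick check (an extra cell, not in the original). Note that the substitutions can leave runs of spaces behind; one more re.sub collapses them:

In [ ]:
messy = 'Check https://example.com/page?x=1 now!'
print(cleanup(messy))                        # 'check   now' -- note the leftover spaces
print(re.sub(r'\s+', ' ', cleanup(messy)))   # 'check now'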

Tokenization

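word_tokenize() and sent_tokenize() depend on NLTK's punkt model; if it has never been downloaded, fetch it once (same pattern as the stopwords download further down):

In [ ]:
import nltk
nltk.download('punkt')  # newer NLTK releases use the 'punkt_tab' resource instead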
In [5]:
from nltk.tokenize import word_tokenize, sent_tokenize
In [6]:
txt = 'An electrocardiogram is used to record the electrical conduction through a person\'s heart. The readings can be used to diagnose cardiac arrhythmias.'
print(txt)
An electrocardiogram is used to record the electrical conduction through a person's heart. The readings can be used to diagnose cardiac arrhythmias.
In [7]:
# Split on spaces
print(txt.split(' '))
['An', 'electrocardiogram', 'is', 'used', 'to', 'record', 'the', 'electrical', 'conduction', 'through', 'a', "person's", 'heart.', 'The', 'readings', 'can', 'be', 'used', 'to', 'diagnose', 'cardiac', 'arrhythmias.']
In [8]:
# Split on words
print(word_tokenize(txt))
['An', 'electrocardiogram', 'is', 'used', 'to', 'record', 'the', 'electrical', 'conduction', 'through', 'a', 'person', "'s", 'heart', '.', 'The', 'readings', 'can', 'be', 'used', 'to', 'diagnose', 'cardiac', 'arrhythmias', '.']
In [9]:
# Split on sentences
print(sent_tokenize(txt))
["An electrocardiogram is used to record the electrical conduction through a person's heart.", 'The readings can be used to diagnose cardiac arrhythmias.']

Stopword removal

In [ ]:
import nltk
nltk.download('stopwords')
In [10]:
from nltk.corpus import stopwords
In [11]:
stopwords_eng = set(stopwords.words('english'))
len(stopwords_eng)
Out[11]:
179
In [12]:
stopwords_eng
Out[12]:
{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 'same',
 'shan',
 "shan't",
 'she',
 "she's",
 'should',
 "should've",
 'shouldn',
 "shouldn't",
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 "that'll",
 'the',
 'their',
 'theirs',
 'them',
 'themselves',
 'then',
 'there',
 'these',
 'they',
 'this',
 'those',
 'through',
 'to',
 'too',
 'under',
 'until',
 'up',
 've',
 'very',
 'was',
 'wasn',
 "wasn't",
 'we',
 'were',
 'weren',
 "weren't",
 'what',
 'when',
 'where',
 'which',
 'while',
 'who',
 'whom',
 'why',
 'will',
 'with',
 'won',
 "won't",
 'wouldn',
 "wouldn't",
 'y',
 'you',
 "you'd",
 "you'll",
 "you're",
 "you've",
 'your',
 'yours',
 'yourself',
 'yourselves'}
In [13]:
txt = "NBC was founded in 1926 making it the oldest major broadcast network in the USA"
txt
Out[13]:
'NBC was founded in 1926 making it the oldest major broadcast network in the USA'
In [14]:
" ".join([w for w in word_tokenize(txt)
            if w not in stopwords_eng])
Out[14]:
'NBC founded 1926 making oldest major broadcast network USA'
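
The stopword list is all lowercase, so a capitalized stopword such as 'The' at the start of a sentence would slip through the filter. Lowercasing the text before filtering (a small tweak, not shown in the original) avoids that:

In [ ]:
" ".join([w for w in word_tokenize(txt.lower())
            if w not in stopwords_eng])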

Spelling correction

In [ ]:
!pip install pyspellchecker
In [15]:
from spellchecker import SpellChecker
In [16]:
txt = 'The qiuck brown fox jmps over the lazy dog'
txt
Out[16]:
'The qiuck brown fox jmps over the lazy dog'
In [17]:
spell = SpellChecker()

" ".join([spell.correction(w)
            for w in word_tokenize(txt)])
Out[17]:
'The quick brown fox mps over the lazy dog'
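
One caveat: recent pyspellchecker releases return None from correction() when no candidate is found, which would break the join above. Falling back to the original token is a simple guard (a sketch under that assumption):

In [ ]:
" ".join([spell.correction(w) or w
            for w in word_tokenize(txt)])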

Stemming

In [18]:
from nltk.stem import PorterStemmer
In [19]:
txt = "NBC was founded in 1926 making it the oldest major broadcast network in the USA"
txt
Out[19]:
'NBC was founded in 1926 making it the oldest major broadcast network in the USA'
In [20]:
stemmer = PorterStemmer()

" ".join([stemmer.stem(w)
            for w in word_tokenize(txt)])
Out[20]:
'nbc wa found in 1926 make it the oldest major broadcast network in the usa'
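
Note the artifacts: PorterStemmer lowercases everything and clips 'was' to 'wa'. NLTK also ships the newer Snowball ('Porter2') stemmer, which handles some of these cases differently; a quick side-by-side (illustrative):

In [ ]:
from nltk.stem import SnowballStemmer

snowball = SnowballStemmer('english')
for w in ['was', 'founded', 'making', 'oldest']:
    print(w, '->', stemmer.stem(w), '/', snowball.stem(w))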

Lemmatization

In [ ]:
import nltk
nltk.download('wordnet') 
In [21]:
from nltk.stem import WordNetLemmatizer
In [22]:
lemmatizer = WordNetLemmatizer()

" ".join([lemmatizer.lemmatize(w)
            for w in word_tokenize(txt)])
Out[22]:
'NBC wa founded in 1926 making it the oldest major broadcast network in the USA'
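
By default lemmatize() treats every token as a noun, which is why 'was' loses its final 's' (treated as a plural) and 'founded'/'making' pass through unchanged. Supplying the part of speech fixes this:

In [ ]:
print(lemmatizer.lemmatize('was', pos='v'))      # be
print(lemmatizer.lemmatize('founded', pos='v'))  # found
print(lemmatizer.lemmatize('making', pos='v'))   # make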

Lemmatization with Part-of-speech

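pos_tag() needs NLTK's averaged perceptron tagger; download it once if it is missing:

In [ ]:
import nltk
nltk.download('averaged_perceptron_tagger')  # 'averaged_perceptron_tagger_eng' on newer NLTK releases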
In [23]:
from nltk import pos_tag
In [24]:
pos_tag(word_tokenize(txt))
Out[24]:
[('NBC', 'NNP'),
 ('was', 'VBD'),
 ('founded', 'VBN'),
 ('in', 'IN'),
 ('1926', 'CD'),
 ('making', 'VBG'),
 ('it', 'PRP'),
 ('the', 'DT'),
 ('oldest', 'JJS'),
 ('major', 'JJ'),
 ('broadcast', 'NN'),
 ('network', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('USA', 'NNP')]

The part-of-speech tags returned by pos_tag() are Penn Treebank tags, which differ from WordNet's, so we need to convert them

In [25]:
treebank_wordnet_pos = {
    'J': 'a', # adjective
    'V': 'v', # verb
    'N': 'n', # noun
    'R': 'r', # adverb
}

def get_wordnet_pos(treebank_pos, default='n'):
    return treebank_wordnet_pos.get(treebank_pos[0], default)
In [26]:
" ".join([lemmatizer.lemmatize(w[0], get_wordnet_pos(w[1]))
            for w in pos_tag(word_tokenize(txt))])
Out[26]:
'NBC be found in 1926 make it the old major broadcast network in the USA'
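
Putting it all together, one possible end-to-end pipeline (a sketch; the step order, and whether to lowercase before tagging, are design choices that affect tagger accuracy):

In [ ]:
def preprocess(txt):
    txt = cleanup(txt)                                      # lowercase & noise removal
    tokens = [w for w in word_tokenize(txt)                 # tokenization
              if w not in stopwords_eng]                    # stopword removal
    return [lemmatizer.lemmatize(w, get_wordnet_pos(t))     # POS-aware lemmatization
            for w, t in pos_tag(tokens)]

preprocess('@Craig_Spur We have put a number on every single head within the Covid world')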

Synonyms

In [27]:
from nltk.corpus import wordnet as wn
In [28]:
synonyms = wn.synsets('nonetheless')

if synonyms:
    syn = synonyms[0]

    print(syn.pos(), syn.lemma_names()[0])
r however
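
Collecting the lemma names across all of a word's synsets gives a rough synonym list (an extra illustration, not in the original):

In [ ]:
sorted({lemma.name() for syn in wn.synsets('nonetheless')
                     for lemma in syn.lemmas()})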