from nltk import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
# Demo sentence (from The Wonderful Wizard of Oz).
txt = 'We are so grateful to you for having killed the Wicked Witch of the East, and for setting our people free from bondage.'
# Lowercase, split into word tokens, then attach a part-of-speech tag to each token.
tokens = word_tokenize(txt.lower())
txt_pos = pos_tag(tokens)
txt_pos
from nltk import RegexpParser
# Noun phrases
# Chunk grammar: an optional determiner, any number of adjectives, then one noun.
# "NP" is just the label we give the matched chunks.
chunk_parser = RegexpParser("NP: {<DT>?<JJ>*<NN>}")
chunks = chunk_parser.parse(txt_pos)
chunks
# Double-click on the image to see it fully
# Print each noun-phrase chunk as a tuple of (word, tag) pairs.
for np_subtree in chunks.subtrees(filter=lambda t: t.label() == 'NP'):
    print(tuple(np_subtree))
# Collect the surface text of every NP chunk: join the words, drop the tags.
trees = []
for tree in chunks.subtrees():
    if tree.label() == 'NP':
        trees.append(" ".join(word for word, _tag in tree.leaves()))
trees
!wget http://dev.gutenberg.org/files/55/55.txt -O oz.txt
txt = open('oz.txt', 'r').read()
k = txt.find('*** START')
if k != -1:
k = txt.find('\n', k)
txt = txt[k:]
k = txt.find('*** END')
if k != -1:
txt = txt[:k]
sentences = sent_tokenize(txt)
len(sentences)
from collections import Counter
import re
# Anything that is not a letter, digit, space, or period gets blanked out.
re_punc = re.compile('[^0-9a-zA-Z .]')
chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"
chunk_parser = RegexpParser(chunk_grammar)
chunk_counter = Counter()
# Tally every noun-phrase chunk across the whole book.
for sentence in sentences:
    lowered = re_punc.sub(' ', sentence.lower())
    tagged = pos_tag(word_tokenize(lowered))
    chunks = chunk_parser.parse(tagged)
    chunk_counter.update(
        " ".join(word for word, _tag in np.leaves())
        for np in chunks.subtrees()
        if np.label() == 'NP')
# The 30 most frequent noun phrases.
chunk_counter.most_common(30)
import nltk
# Models required by ne_chunk: the named-entity chunker and its word list.
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize
txt = 'Prime Minister Narendra Modi on Tuesday announced the 20 Lakh Crore package for the India to fight against the coronavirus pandemic.'
txt
# POS-tag the tokens, then run NLTK's named-entity chunker.
# binary=False keeps the entity categories (PERSON, GPE, ...) instead of a bare 'NE'.
txt_pos = pos_tag(word_tokenize(txt))
res = ne_chunk(txt_pos, binary=False)
print(res)
!python -m spacy download en_core_web_sm
import spacy
nlp = spacy.load('en_core_web_sm')
txt
doc = nlp(txt)
for ent in doc.ents:
print(ent.text, ent.label_)
spacy.displacy.render(doc, style='ent')
from nltk import Tree
def to_nltk_tree(node):
    """Recursively convert a spaCy dependency-parse token into an nltk Tree.

    A token with no left or right children becomes a bare string leaf
    (its surface text); otherwise it becomes a Tree labelled with the
    token's text whose children are the converted dependents.
    """
    # Leaf case: no dependents on either side.
    if node.n_lefts + node.n_rights == 0:
        return node.orth_
    return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
# ASCII rendering of each sentence's dependency tree.
for sent in doc.sents:
    sent_tree = to_nltk_tree(sent.root)
    sent_tree.pretty_print()
# Same trees again, rendered via IPython's display() (notebook only).
for sent in doc.sents:
    sent_tree = to_nltk_tree(sent.root)
    display(sent_tree)