Linguistic dictionaries are a reliable way to check for spelling errors in documents with NLP techniques in Python. How? You can build a comparison function with the RecordLinkage module, or write one yourself using the Jaccard similarity measure; a small sketch follows below, and a fuller treatment is a topic for another time.
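A minimal sketch of the Jaccard idea, assuming you compare the character bigrams of a candidate word against each dictionary entry; the helper names (char_bigrams, jaccard_similarity, closest_word) and the bigram choice are illustrative, not a fixed API:
def char_bigrams(word):
    # Set of two-character slices of the word
    return {word[i:i + 2] for i in range(len(word) - 1)}
def jaccard_similarity(a, b):
    # Jaccard similarity = |intersection| / |union| of the two bigram sets
    set_a, set_b = char_bigrams(a.lower()), char_bigrams(b.lower())
    union = set_a | set_b
    return len(set_a & set_b) / len(union) if union else 0.0
def closest_word(token, dictionary):
    # Dictionary entry with the highest similarity to the token
    return max(dictionary, key=lambda entry: jaccard_similarity(token, entry))
dictionary = ["language", "linguistic", "dictionary", "sentiment"]
print(closest_word("languge", dictionary))  # -> "language"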
Please import:
%%time
%matplotlib inline
from matplotlib import pyplot as plt
import time
import re, random
import string
import sys, types, os
import numpy as np
import pandas as pd
from textblob import Word
from nltk.tag import pos_tag
from nltk import word_tokenize
from textblob.taggers import PatternTagger
from textblob.decorators import requires_nltk_corpus
from textblob.utils import tree2str, filter_insignificant
from textblob.base import BaseNPExtractor
from textblob.wordnet import VERB
from spacy import displacy
import nltk
To generate a word list:
from nltk.corpus import words
word_list = words.words()
# Save the word list to a CSV file on your desktop; saving intermediate files like this is a convenient way to keep checkpoints as you change a document in Python.
pd.DataFrame(word_list, columns=['words']).to_csv(r'C:\Users\XXXXXXXX\Desktop\dictwordslang.csv', index=False, header=True)
Save the file to your desktop, then upload it to your Jupyter Notebook or JupyterLab environment and read the word list back into your code (a minimal loading sketch follows below). From there you can prepare the word list for text processing: tokens, lemmas, parts of speech, tags, dependencies, alpha flags, and stop-word flags.
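A minimal sketch of reading the saved CSV back into the wordlist DataFrame used in the code below; the path and the 'words' column name follow the save step above:
import pandas as pd
# Read the saved dictionary back into a DataFrame with a single 'words' column
wordlist = pd.read_csv(r'C:\Users\XXXXXXXX\Desktop\dictwordslang.csv')
wordlist = wordlist.dropna().reset_index(drop=True)  # drop any empty rows
print(wordlist.head())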
Word Tokens: The process of segmenting text into words, punctuation marks, and other units.
Word Lemmatization: The process of grouping the inflected forms of a word so they can be analyzed together, identified by the word's headword or dictionary form.
Word Position: The process of categorizing words into parts of speech.
Word Tags: The process of assigning fine-grained linguistic information to each word.
Word Dependency: The process of assigning syntactic dependency labels that describe the relationships between individual tokens, such as subjects and objects.
Word Alpha: The process of identifying whether a token consists of alphabetic characters.
Word Stop: The process of identifying whether or not a token is a stop word.
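As a quick illustration of these attributes on a single sentence (a minimal sketch, assuming the small English model en_core_web_sm is installed; the sample sentence is arbitrary):
import en_core_web_sm
nlp = en_core_web_sm.load()
# Print each token's text, lemma, POS, tag, dependency, alpha flag, and stop flag
doc = nlp("The striped bats are hanging on their feet.")
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_,
          token.dep_, token.is_alpha, token.is_stop)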
You can compute the same attributes for the whole word list with spaCy:
%%time
tokens = []
lemma = []
pos = []
tag = []
dep = []
alpha = []
stop = []
for doc in nlp.pipe(wordlist['words'].astype('unicode').values, batch_size=100, n_threads=4):
    if doc.is_parsed:
        tokens.append([n.text for n in doc])
        lemma.append([n.lemma_ for n in doc])
        pos.append([n.pos_ for n in doc])
        tag.append([n.tag_ for n in doc])
        dep.append([n.dep_ for n in doc])
        alpha.append([n.is_alpha for n in doc])
        stop.append([n.is_stop for n in doc])
    else:
        # Keep the parsed lists the same length as the original dataframe,
        # so append blanks in case the parse fails
        tokens.append(None)
        lemma.append(None)
        pos.append(None)
        tag.append(None)
        dep.append(None)
        alpha.append(None)
        stop.append(None)
wordlist['tokens'] = tokens
wordlist['lemma'] = lemma
wordlist['pos'] = pos
wordlist['tag'] = tag
wordlist['dep'] = dep
wordlist['alpha'] = alpha
wordlist['stop'] = stop
This takes about 1 minute and 40 seconds to complete on the full word list. Note that if you use this code to analyze a longer document, it will take some time.
Word Sentiment: Understand whether a word is negative, positive, or neutral.
wordlist[['polarity', 'subjectivity']] = wordlist['words'].apply(lambda words: pd.Series(TextBlob(words).sentiment))
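To see which dictionary words carry a clear sentiment, you can filter on the new columns; a minimal sketch, where the threshold of 0 is just illustrative:
# Words TextBlob scores as negative (polarity below 0)
negative_words = wordlist[wordlist['polarity'] < 0][['words', 'polarity', 'subjectivity']]
print(negative_words.head())
# Words scored as positive
positive_words = wordlist[wordlist['polarity'] > 0][['words', 'polarity', 'subjectivity']]
print(positive_words.head())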
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import RegexpParser
from nltk import Tree
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.util import ngrams
stemming = PorterStemmer()
from nltk import trigrams
nltk.download('punkt')
nltk.download('wordnet')
stop = stopwords.words('english')
import en_core_web_sm
nlp = en_core_web_sm.load()
from spacy.language import Language
from spacy.pipeline import EntityRuler
ruler = EntityRuler(nlp)
nlp.add_pipe(ruler)
from translate import Translator
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token
from autocorrect import Speller
spell = Speller(lang='en')
from textblob import TextBlob
import spacy
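With everything loaded, here is a minimal sketch of the actual spelling check, assuming the Speller, TextBlob, and word_list objects created above; the sample sentence is illustrative:
text = "Natral langage processing is facinating."
# autocorrect: correct each word with the Speller defined above
corrected_autocorrect = " ".join(spell(word) for word in text.split())
print(corrected_autocorrect)
# TextBlob: correct the whole sentence at once
corrected_textblob = str(TextBlob(text).correct())
print(corrected_textblob)
# Dictionary lookup: flag tokens that are not in the NLTK word list
dictionary = set(word_list)
unknown = [w for w in word_tokenize(text.lower()) if w.isalpha() and w not in dictionary]
print(unknown)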
