Bayesian news classification
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

news = fetch_20newsgroups(subset='all')
print(news.target_names)
print(len(news.data))
print(len(news.target))
print(len(news.target_names))
news.data[0]
print(news.target[0])
print(news.target_names[news.target[0]])
x_train, x_test, y_train, y_test = train_test_split(news.data, news.target)
# Alternatively, use the predefined train/test split:
# train = fetch_20newsgroups(subset='train')
# x_train = train.data
# y_train = train.target
# test = fetch_20newsgroups(subset='test')
# x_test = test.data
# y_test = test.target
from sklearn.feature_extraction.text import CountVectorizer

texts = ["dog cat fish", "dog cat cat", "fish bird", "bird"]
cv = CountVectorizer()
cv_fit = cv.fit_transform(texts)
# print(cv.get_feature_names())
print(cv_fit.toarray())
print(cv_fit.toarray().sum(axis=0))
from sklearn import model_selection
from sklearn.naive_bayes import MultinomialNB

cv = CountVectorizer()
cv_data = cv.fit_transform(x_train)
mul_nb = MultinomialNB()
scores = model_selection.cross_val_score(mul_nb, cv_data, y_train, cv=3, scoring='accuracy')
print("Accuracy: %0.3f" % (scores.mean()))
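To evaluate on the held-out split as well, the same fitted CountVectorizer has to be reused to transform the test documents; refitting it on the test text would change the vocabulary. A minimal sketch, assuming the variables above:

mul_nb.fit(cv_data, y_train)
# transform (not fit_transform) the test documents with the vectorizer fitted on x_train
test_data = cv.transform(x_test)
print("Test accuracy: %0.3f" % mul_nb.score(test_data, y_test))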
TfidfVectorizer uses a weighting scheme called Term Frequency-Inverse Document Frequency (TF-IDF), a statistical measure of how important a word is to a document within a corpus. Intuitively, it boosts words that occur often in the current document but rarely in the corpus as a whole, and down-weights words that appear everywhere. This normalization prevents very common words from dominating the representation of a document even though they say little about its content (for example, "a" and "and" occur very frequently in English but carry almost no information about a text's topic).
from sklearn.feature_extraction.text import TfidfVectorizer

# List of text documents
text = ["The quick brown fox jumped over the lazy dog.",
        "The dog.",
        "The fox"]
# Create the transformer
vectorizer = TfidfVectorizer()
# Tokenize and build the vocabulary
vectorizer.fit(text)
# Summary
print(vectorizer.vocabulary_)
print(vectorizer.idf_)
# Encode a document
vector = vectorizer.transform([text[0]])
# Summarize the encoded document
print(vector.shape)
print(vector.toarray())
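With sklearn's default smoothing, the idf values printed above follow idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) is the number of documents containing term t. A small sanity check for the word fox in the three-document example (a sketch, assuming the fitted vectorizer above):

import numpy as np

n = len(text)        # 3 documents
df_fox = 2           # 'fox' appears in the first and third documents
idf_fox = np.log((1 + n) / (1 + df_fox)) + 1
print(idf_fox)
print(vectorizer.idf_[vectorizer.vocabulary_['fox']])  # should match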
# Create the transformer
vectorizer = TfidfVectorizer()
# Tokenize and build the vocabulary
tfidf_train = vectorizer.fit_transform(x_train)
scores = model_selection.cross_val_score(mul_nb, tfidf_train, y_train, cv=3, scoring='accuracy')
print("Accuracy: %0.3f" % (scores.mean()))
def get_stop_words():
    result = set()
    for line in open('stopwords_en.txt', 'r').readlines():
        result.add(line.strip())
    return result

# Load stop words
stop_words = get_stop_words()

# Create the transformer
vectorizer = TfidfVectorizer(stop_words=stop_words)
mul_nb = MultinomialNB(alpha=0.01)

# Tokenize and build the vocabulary
tfidf_train = vectorizer.fit_transform(x_train)
scores = model_selection.cross_val_score(mul_nb, tfidf_train, y_train, cv=3, scoring='accuracy')
print("Accuracy: %0.3f" % (scores.mean()))
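If stopwords_en.txt is not available, a rough fallback is sklearn's built-in English stop-word list; it is not identical to the file used above, so the accuracy may differ slightly. A minimal sketch:

# Fallback: built-in English stop words instead of the external file
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_train = vectorizer.fit_transform(x_train)
scores = model_selection.cross_val_score(mul_nb, tfidf_train, y_train, cv=3, scoring='accuracy')
print("Accuracy: %0.3f" % (scores.mean()))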
# Split the dataset
tfidf_data = vectorizer.fit_transform(news.data)
x_train, x_test, y_train, y_test = train_test_split(tfidf_data, news.target)
mul_nb.fit(x_train, y_train)
print(mul_nb.score(x_train, y_train))
print(mul_nb.score(x_test, y_test))
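To see the classifier in action, the predicted label ids can be mapped back to readable category names with news.target_names. A short usage sketch, assuming the fitted model above:

# Predict the first few test documents and compare with the true categories
predictions = mul_nb.predict(x_test[:5])
for pred, true in zip(predictions, y_test[:5]):
    print(news.target_names[pred], '| actual:', news.target_names[true])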
Bayesian spell checker
Spell checker principle
Among all correctly spelled words, we want to find the word c that maximizes the conditional probability P(c|w), where w is the (possibly misspelled) word the user actually typed. By Bayes' theorem:
P(c|w) = P(w|c) P(c) / P(w)
For example, if the typed word w is appla and the candidate corrections c are apple and apply, then P(w) is the same for both candidates, so it can be dropped from the formula, leaving:
P(w|c) P(c)
P(c): the probability that the correctly spelled word c appears in an English text, i.e. the prior probability of c.
We assume that the more often a word appears in the corpus, the more likely it is to be the intended, correctly spelled word, so this probability can be replaced by a simple word count. For example, in English the word the occurs very frequently, while a rare or obscure string has a probability close to zero (assuming it even counts as a word).
P(w|c): the probability of typing w when the user meant to type c, i.e. how likely it is that c gets mistyped as w.
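As a toy illustration with made-up counts (not taken from any corpus): suppose apple occurs 50 times in a reference text and apply occurs 10 times. Both are one edit away from the typo appla, so P(w|c) does not separate them and the prior P(c) decides:

# Hypothetical word counts standing in for P(c)
counts = {'apple': 50, 'apply': 10}
candidates = ['apple', 'apply']
best = max(candidates, key=lambda c: counts[c])
print(best)  # 'apple', because it has the larger count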
import re
# Read the corpus
text = open('big.txt').read()

# Convert to lowercase and keep only runs of the letters a-z
text = re.findall('[a-z]+', text.lower())

# Count how often each word occurs
dic_words = {}
for t in text:
    dic_words[t] = dic_words.get(t, 0) + 1
dic_words
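The counts in dic_words play the role of the (unnormalized) prior P(c). A quick sanity check, assuming big.txt contains ordinary English text:

print(dic_words.get('the', 0))    # a very common word should have a large count
print(dic_words.get('apple', 0))  # an ordinary word has a modest count
print(dic_words.get('appla', 0))  # a misspelling is unlikely to appear at all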
Edit distance:
The edit distance between two words is defined as the number of single-character operations needed to turn one word into the other: insertion (insert a single letter), deletion (delete a single letter), transposition (swap two adjacent letters), and alteration (replace one letter with another).
# alphabet
alphabet = 'abcdefghijklmnopqrstuvwxyz'

# Return the set of all strings at edit distance 1 from word
def edits1(word):
    n = len(word)
    return set([word[0:i] + word[i+1:] for i in range(n)] +                         # deletion
               [word[0:i] + word[i+1] + word[i] + word[i+2:] for i in range(n-1)] +  # transposition
               [word[0:i] + c + word[i+1:] for i in range(n) for c in alphabet] +    # alteration
               [word[0:i] + c + word[i:] for i in range(n+1) for c in alphabet])     # insertion
apple = 'apple'
apple[0:0] + apple[1:]
# Return the set of all strings at edit distance 2 from word
# Of these, only the strings that are real words will later be kept as candidates
def edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1))
e1 = edits1('something')
e2 = edits2('something')
len(e1) + len(e2)
There are 114818 strings within edit distance 1 or 2 of 'something'.
Optimization: only real words (strings that actually appear in the dictionary) are kept as candidates. With this optimization, the edit-distance-2 candidates for 'something' shrink to just a handful of real words, such as 'smoothing' and 'something'.
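The filtering idea itself is a one-liner: keep only the generated strings that actually occur in the frequency dictionary built from big.txt. A sketch, assuming edits2 and dic_words from above:

known_candidates = {w for w in edits2('something') if w in dic_words}
print(known_candidates)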
Solving P(w|c): in reality, some errors are more likely than others; for example, substituting one vowel for another is more likely than substituting a consonant (people often type hello as hallo), and the first letter of a word is rarely the one that is misspelled. For simplicity, however, we use a much cruder rule: a correct word at edit distance 1 takes priority over one at edit distance 2, and a correct word at edit distance 0 takes priority over one at edit distance 1. That is, typing hello as hallo (one edit) is assumed to be more likely than typing it as a string two edits away.
def known(words):
    w = set()
    for word in words:
        if word in dic_words:
            w.add(word)
    return w

# First generate candidates by edit distance, then pick the best-matching word
def correct(word):
    # Get candidate words
    # If known(set) is non-empty, that set is used and the later alternatives are not computed
    candidates = known([word]) or known(edits1(word)) or known(edits2(word)) or word
    # No similar word exists in the dictionary
    if word == candidates:
        return word
    # Return the most frequent candidate
    max_num = 0
    for c in candidates:
        if dic_words[c] >= max_num:
            max_num = dic_words[c]
            candidate = c
    return candidate
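Example usage (the exact corrections depend on the word frequencies in big.txt):

print(correct('appla'))
print(correct('speling'))
print(correct('teh'))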