I have successfully trained and tested a Support Vector classifier model to classify each row based on its title and abstract, using two user-defined functions (UDFs). The first UDF preprocesses the data and the second builds the model. To create the model, I used df1, which was already classified.
I am stuck on how to apply this trained model to a new, unclassified dataframe, say df2. Any suggestion or help would be welcome.
See the user-defined functions for preprocessing and model building below:
def preprocessing(col, h_pct=1, l_pct=1):
    """Clean a pandas Series of raw text for classification.

    Pipeline: lowercase -> Snowball stemming -> strip punctuation ->
    drop English stopwords -> drop pure-number tokens -> drop
    single-character tokens -> optionally drop the most / least
    frequent words across the whole corpus.

    Parameters
    ----------
    col : pandas.Series of str
        The text column to clean.
    h_pct : int or float, default 1
        Percent of the highest-frequency words to remove (0 disables).
    l_pct : int or float, default 1
        Percent of the lowest-frequency words to remove (0 disables).

    Returns
    -------
    pandas.Series of str
        The cleaned text, same index as `col`.
    """
    import re
    from nltk.corpus import stopwords
    from nltk.stem import SnowballStemmer

    # Lowercase everything so stemming / stopword matching is case-insensitive.
    lower = col.apply(str.lower)

    # Stem each token to its root form.
    stem = SnowballStemmer('english')
    stemmed = lower.apply(lambda x: ' '.join(stem.stem(word) for word in str(x).split()))

    # Replace punctuation with spaces (tokens stay separated).
    rem_punc = stemmed.apply(lambda x: re.sub(r'[^\w\s]', ' ', x))

    # Remove English stopwords; joining on spaces also collapses extra whitespace.
    stop_words = set(stopwords.words('english'))
    rem_stopwords = rem_punc.apply(
        lambda x: " ".join(w for w in x.split() if w not in stop_words))

    # Remove purely numeric tokens.
    rem_num = rem_stopwords.apply(
        lambda x: " ".join(w for w in x.split() if not w.isdigit()))

    # BUG FIX: the original repeated the punctuation regex here instead of
    # removing length-1 words as the comment claimed. Actually drop them now.
    rem_lngth1 = rem_num.apply(
        lambda x: " ".join(w for w in x.split() if len(w) > 1))

    if h_pct != 0:
        # Remove the top h_pct% most frequent words across the corpus.
        # Precompute a set for O(1) membership tests (the original tested
        # against a Series index per token — same result, slower).
        all_words = pd.Series(' '.join(rem_lngth1).split())
        n_high = int(all_words.count() * h_pct / 100)
        high_freq = set(all_words.value_counts()[:n_high].index)
        rem_high = rem_lngth1.apply(
            lambda x: " ".join(w for w in x.split() if w not in high_freq))
    else:
        rem_high = rem_lngth1

    if l_pct != 0:
        # Remove the bottom l_pct% least frequent words across the corpus.
        all_words = pd.Series(' '.join(rem_high).split())
        n_low = int(all_words.count() * l_pct / 100)
        low_freq = set(all_words.value_counts()[:-n_low:-1].index)
        rem_low = rem_high.apply(
            lambda x: " ".join(w for w in x.split() if w not in low_freq))
    else:
        rem_low = rem_high

    return rem_low
def prep_fit_pred(df, h_pct, l_pct, model, verbose=False):
    """Preprocess `df`, train `model` on a stratified split, and score it.

    Adds 'new_Abstract', 'concat' and 'concat_processed' columns to `df`
    in place, vectorizes the processed text with the module-level `bow`
    vectorizer, fits `model`, and reports test-set accuracy.

    Returns a (predictions, accuracy_percent, fitted_model) tuple.
    NOTE(review): relies on globals `bow`, `train_test_split` and
    `accuracy_score` being in scope — confirm at module level.
    """
    # Clean the abstract with frequency-based pruning.
    df['new_Abstract'] = preprocessing(df['Abstract'], h_pct, l_pct)
    df['concat'] = df['Title'] + '\n' + df['new_Abstract']
    # The headline is deliberately NOT pruned of high/low-frequency words:
    # it carries more weight in determining the classification of the news.
    df['concat_processed'] = preprocessing(df['concat'], 0, 0)

    features = df['concat_processed']
    labels = df['Category']
    split = train_test_split(features, labels, test_size=0.33,
                             random_state=42, stratify=labels)
    train_text, test_text, train_labels, test_labels = split

    # Fit the vectorizer on training text only; transform the test text.
    train_vectors = bow.fit_transform(train_text)
    test_vectors = bow.transform(test_text)

    model.fit(train_vectors, train_labels)
    predictions = model.predict(test_vectors)
    accuracy = accuracy_score(test_labels, predictions) * 100
    return predictions, accuracy, model
Read more here: https://stackoverflow.com/questions/67010708/how-to-classify-records-using-already-trained-model
Content Attribution
This content was originally published by Arun Menon at Recent Questions - Stack Overflow, and is syndicated here via their RSS feed. You can read the original post over there.