I have successfully trained and tested a Support Vector classifier model to classify each row based on title and abstract, using two user-defined functions (UDFs). The first UDF preprocesses the data and the second builds the model. To create the model I used df1, which was already classified beforehand.
I am stuck on how to apply this trained model to a new, unclassified dataframe, say df2. Any suggestions or help would be welcome.
See the user-defined functions for preprocessing and model building below.
def preprocessing(col, h_pct=1, l_pct=1):
    """Clean a pandas Series of text for bag-of-words modelling.

    Steps: lower-case, Snowball-stem, strip punctuation, remove English
    stopwords, remove pure-number tokens, remove single-character words,
    then optionally drop the most- and least-frequent words in the corpus.

    Parameters
    ----------
    col : pd.Series of str
        The raw text column (e.g. abstracts).
    h_pct : float, default 1
        Percentage of the most frequent corpus words to remove (0 = skip).
    l_pct : float, default 1
        Percentage of the least frequent corpus words to remove (0 = skip).

    Returns
    -------
    pd.Series of cleaned strings, aligned with ``col``'s index.
    """
    import re
    from nltk.stem import SnowballStemmer
    from nltk.corpus import stopwords

    # Lower case.
    lower = col.apply(str.lower)

    # Stemming.
    stem = SnowballStemmer('english')
    stemmed = lower.apply(
        lambda x: ' '.join(stem.stem(word) for word in str(x).split()))

    # Removing punctuation.
    rem_punc = stemmed.apply(lambda x: re.sub(r'[^\w\s]', ' ', x))

    # Removing stopwords and extra spaces.
    stop_words = set(stopwords.words('english'))
    rem_stopwords = rem_punc.apply(
        lambda x: " ".join(w for w in x.split() if w not in stop_words))

    # Removing numbers.
    rem_num = rem_stopwords.apply(
        lambda x: " ".join(w for w in x.split() if not w.isdigit()))

    # Remove words of length 1.
    # BUG FIX: the original re-applied the punctuation regex here (a no-op
    # at this stage), so single-character words were never actually removed.
    rem_lngth1 = rem_num.apply(
        lambda x: " ".join(w for w in x.split() if len(w) > 1))

    if h_pct != 0:
        # Removing the top h_pct% most frequent words across the corpus.
        # Build the corpus-word Series once instead of twice per filter.
        corpus_words = pd.Series(' '.join(rem_lngth1).split())
        high_freq = corpus_words.value_counts()[
            :int(corpus_words.count() * h_pct / 100)]
        # `w not in high_freq` tests membership in the value_counts index,
        # i.e. the set of most-frequent words.
        rem_high = rem_lngth1.apply(
            lambda x: " ".join(w for w in x.split() if w not in high_freq))
    else:
        rem_high = rem_lngth1

    if l_pct != 0:
        # Removing the top l_pct% of the least frequent words (taken from
        # the reversed tail of value_counts).
        corpus_words = pd.Series(' '.join(rem_high).split())
        low_freq = corpus_words.value_counts()[
            :-int(corpus_words.count() * l_pct / 100):-1]
        rem_low = rem_high.apply(
            lambda x: " ".join(w for w in x.split() if w not in low_freq))
    else:
        rem_low = rem_high

    return rem_low


def prep_fit_pred(df, h_pct, l_pct, model, verbose=False):
    """Preprocess ``df``, train ``model`` on a bag-of-words of the
    title + abstract text, and report accuracy on a held-out split.

    NOTE(review): relies on module-level names being in scope —
    a fitted-on-call vectorizer ``bow`` (e.g. CountVectorizer),
    ``train_test_split``, ``accuracy_score``, and ``pd`` — confirm
    these are imported/defined at module level.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Title', 'Abstract' and 'Category' columns.
        Mutated in place: adds 'new_Abstract', 'concat',
        'concat_processed' columns.
    h_pct, l_pct : float
        Frequency-filter percentages passed to ``preprocessing`` for the
        abstract text.
    model : estimator with fit/predict
        The classifier to train (e.g. an SVC instance).
    verbose : bool, default False
        Currently unused; kept for interface compatibility.

    Returns
    -------
    (preds, acc, model) : predictions on the test split, accuracy in
    percent, and the fitted model.
    """
    df['new_Abstract'] = preprocessing(df['Abstract'], h_pct, l_pct)
    df['concat'] = df['Title'] + '\n' + df['new_Abstract']
    # Not removing high- and low-frequency words from the headline:
    # the headline carries more significance in determining the
    # classification of the news.
    df['concat_processed'] = preprocessing(df['concat'], 0, 0)

    X = df['concat_processed']
    y = df['Category']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42, stratify=y)

    # Fit the vectorizer on the training split only, then reuse the fitted
    # vocabulary to transform the test split.
    bow_xtrain = bow.fit_transform(X_train)
    bow_xtest = bow.transform(X_test)

    model.fit(bow_xtrain, y_train)
    preds = model.predict(bow_xtest)
    acc = accuracy_score(y_test, preds) * 100
    return preds, acc, model