How do I classify records using an already-trained model?

I have successfully trained and tested a Support Vector classifier model to classify each row based on title and abstract, using two user-defined functions (UDFs). The first UDF is for preprocessing of the data and the second UDF is for the model building. To create the model I used df1, which was already classified beforehand.

I am stuck on how to apply this trained model to a new, unclassified dataframe, say df2. Any suggestion or help would be welcome.

See the user-defined functions for preprocessing and model building below:

def preprocessing(col, h_pct=1, l_pct=1):
    """Clean a Series of text for classification.

    Steps: lower-case, stem, strip punctuation, drop stopwords, drop
    numbers, drop single-character words, then optionally remove the most
    frequent ``h_pct`` percent and least frequent ``l_pct`` percent of
    words across the whole corpus.

    Parameters
    ----------
    col : pandas.Series of str
        Raw text, one document per row.
    h_pct : float
        Percent of the highest-frequency words to remove (0 = skip).
    l_pct : float
        Percent of the lowest-frequency words to remove (0 = skip).

    Returns
    -------
    pandas.Series of cleaned str, same index as ``col``.
    """
    import re
    from nltk.stem import SnowballStemmer
    from nltk.corpus import stopwords

    # Lower case
    lower = col.apply(str.lower)
    # Stemming
    stem = SnowballStemmer('english')
    stemmed = lower.apply(lambda x: ' '.join(stem.stem(word) for word in str(x).split()))
    # Removing punctuation
    rem_punc = stemmed.apply(lambda x: re.sub(r'[^\w\s]', ' ', x))
    # Removing stopwords and extra spaces
    stop_words = set(stopwords.words('english'))
    rem_stopwords = rem_punc.apply(lambda x: " ".join(w for w in x.split() if w not in stop_words))
    # Removing numbers
    rem_num = rem_stopwords.apply(lambda x: " ".join(w for w in x.split() if not w.isdigit()))
    # Remove words having length 1 (the original re-applied the punctuation
    # regex here instead of actually filtering by length — fixed).
    result = rem_num.apply(lambda x: " ".join(w for w in x.split() if len(w) > 1))

    if h_pct != 0:
        # Removing the top h_pct% most frequent words across the corpus.
        # (The original overwrote the filtered result with the unfiltered
        # Series right after computing it — fixed.)
        all_words = pd.Series(' '.join(result).split())
        high_freq = set(all_words.value_counts()[:int(all_words.count() * h_pct / 100)].index)
        result = result.apply(lambda x: " ".join(w for w in x.split() if w not in high_freq))
    if l_pct != 0:
        # Removing the bottom l_pct% least frequent words across the corpus.
        all_words = pd.Series(' '.join(result).split())
        low_freq = set(all_words.value_counts()[:-int(all_words.count() * l_pct / 100):-1].index)
        result = result.apply(lambda x: " ".join(w for w in x.split() if w not in low_freq))
    # Always defined, even when h_pct == 0 and l_pct == 0 (the original
    # raised NameError on `rem_low` when l_pct == 0).
    return result

def prep_fit_pred(df, h_pct, l_pct, model, verbose=False):
    """Preprocess ``df``, fit ``model`` on a train split, and evaluate.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Title', 'Abstract' and 'Category' columns.
    h_pct, l_pct : float
        High/low frequency word-removal percentages passed to
        ``preprocessing`` for the Abstract column.
    model : sklearn-style estimator
        Fitted in place on the bag-of-words training matrix.
    verbose : bool
        If True, print the test-set accuracy.

    Returns
    -------
    (preds, acc, model) : predictions on the held-out set, accuracy in
    percent, and the fitted model.
    """
    df['new_Abstract'] = preprocessing(df['Abstract'], h_pct, l_pct)
    df['concat'] = df['Title'] + '\n' + df['new_Abstract']
    # Not removing high/low frequency words from the headline portion:
    # the headline carries more significance in determining the
    # classification of the news.
    df['concat_processed'] = preprocessing(df['concat'], 0, 0)

    X = df['concat_processed']
    y = df['Category']
    # The original call was truncated mid-line and the model.fit line was
    # fused into bow.transform — reconstructed here.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    bow_xtrain = bow.fit_transform(X_train)
    bow_xtest = bow.transform(X_test)
    model.fit(bow_xtrain, y_train)
    preds = model.predict(bow_xtest)

    acc = accuracy_score(y_test, preds) * 100
    if verbose:
        # `verbose` was accepted but never used in the original.
        print(f"Accuracy: {acc:.2f}%")
    return preds, acc, model

Read more here:

Content Attribution

This content was originally published by Arun Menon at Recent Questions - Stack Overflow, and is syndicated here via their RSS feed. You can read the original post over there.

%d bloggers like this: