还可以构建一些额外的基于文本的的特征,这些特征有时有助于提升文本分类模型性能。一些例子如下:
文档的词计数—文档中词总数
文档的字符计数—文档中字符总数
文档的平均词密度—文档中词的平均长度
整篇文章中的标点符号计数—文档中标点符号的总数
整篇文章中大写词计数—文档中大写词的总数
整篇文章中标题词计数—文档中合适的大小写(标题)词总数
词性标签的频率分布:
名词计数
动词计数
形容词计数
副词计数
代词计数
这些特征是实验性质的,只能根据特定的情况使用。
trainDF['char_count'] = trainDF['text'].apply(len)
trainDF['word_count'] = trainDF['text'].apply(lambda x: len(x.split()))
trainDF['word_density'] = trainDF['char_count'] / (trainDF['word_count']+1)
trainDF['punctuation_count'] = trainDF['text'].apply(lambda x: len("".join(_ for _ in xif _ in string.punctuation)))
trainDF['title_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
trainDF['upper_case_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))
pos_family = {
'noun' : ['NN','NNS','NNP','NNPS'],
'pron' : ['PRP','PRP$','WP','WP$'],
'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
'adj' : ['JJ','JJR','JJS'],
'adv' : ['RB','RBR','RBS','WRB']
}
# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
cnt = 0
try:
wiki = textblob.TextBlob(x)
for tup in wiki.tags:
ppo = list(tup)[1]
if ppo in pos_family[flag]:
cnt += 1
except:
pass
return cnt
trainDF['noun_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'noun'))
trainDF['verb_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'verb'))
trainDF['adj_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adj'))
trainDF['adv_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adv'))
trainDF['pron_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'pron'))