-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleanText.py
66 lines (53 loc) · 1.68 KB
/
cleanText.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
'''
cleanText.py
This file contains functions that extract text.py uses to clean text for matching with lexicon.
Only basic cleaning is done: stripping diacritics from the text, and stripping special characters and numbers with re.
'''
import re
import unicodedata
def strip_accents(text):
    """
    Strip accents (diacritics) from the input string.

    The text is decomposed with Unicode NFD normalization, so each accented
    letter becomes a base letter followed by combining marks, and then every
    non-ASCII code point is dropped. Characters without an ASCII base letter
    in their decomposition (for example 'ß' or 'ø') are removed entirely.

    :param text: The input string. UTF-8 encoded bytes are also accepted.
    :type text: String.
    :returns: The processed String, containing ASCII characters only.
    :rtype: String.
    """
    # Decode byte input up front. This covers Python 2 ``str`` as well as
    # Python 3 ``bytes``; the previous ``unicode(...)`` shim skipped the
    # latter, which made ``unicodedata.normalize`` fail with a TypeError.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    decomposed = unicodedata.normalize('NFD', text)
    # Encoding to ASCII with 'ignore' discards the combining marks (and any
    # other non-ASCII character) left behind by the decomposition.
    ascii_only = decomposed.encode('ascii', 'ignore')
    return str(ascii_only.decode('ascii'))
def strip_digitsAndSpecialChars(text):
    """
    Replace digits, special characters and single-character words with spaces.

    Dots are kept but padded with spaces so that they stand alone as
    separate tokens. Runs of spaces created by the substitutions are not
    collapsed, so the result may contain consecutive spaces.

    :param text: The input string.
    :type text: String.
    :returns: The processed String.
    :rtype: String.
    """
    # Collapse runs of spaces present in the raw input.
    text = re.sub('[ ]+', ' ', text)
    # Substitute values that aren't letters, numbers, underscore or dot.
    text = re.sub('[^0-9a-zA-Z_.]', ' ', text)
    # Separate dots from the text by surrounding them with whitespace.
    text = re.sub('[.]', ' . ', text)
    # Substitute digits with whitespace.
    text = re.sub('[0-9]', ' ', text)
    # Substitute single-character words with whitespace. The repeated inner
    # group swallows a whole run such as "a b c" in one match; matching one
    # word at a time consumes the separating space and therefore skips every
    # other word in the run.
    text = re.sub(r'(?:^| )(?:\w(?:$| ))+', ' ', text)
    return text
def clean_text(text):
    """
    Run the full cleaning pipeline on the input text.

    The text is lower-cased, then passed through ``strip_accents`` and
    finally through ``strip_digitsAndSpecialChars``.

    :param text: The input string.
    :type text: String.
    :returns: The processed String.
    :rtype: String.
    """
    lowered = text.lower()
    return strip_digitsAndSpecialChars(strip_accents(lowered))