-
Notifications
You must be signed in to change notification settings - Fork 0
/
only_Eng_or_Chinese.py
81 lines (71 loc) · 2.39 KB
/
only_Eng_or_Chinese.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# -*- coding: utf-8 -*-
def is_ustr(in_str, chinese=True, ASCII=True, numbers=True,
            special_characters=True, blanks=True,
            ASCII_punctuation=True, chinese_punctuation=True):
    """Return ``in_str`` with every character rejected by ``is_uchar`` removed.

    Each character is passed to ``is_uchar`` together with the category
    flags below; characters for which it returns a truthy value are kept in
    their original order and all others are dropped.

    :param in_str: string to filter
    :param chinese: flag forwarded to ``is_uchar``
    :param ASCII: flag forwarded to ``is_uchar``
    :param numbers: flag forwarded to ``is_uchar``
    :param special_characters: flag forwarded to ``is_uchar``
    :param blanks: flag forwarded to ``is_uchar``
    :param ASCII_punctuation: flag forwarded to ``is_uchar``
    :param chinese_punctuation: flag forwarded to ``is_uchar``
    :return: string with only allowed characters
    """
    # Join once instead of growing the result with repeated concatenation.
    return ''.join(
        char for char in in_str
        if is_uchar(char, chinese, ASCII, numbers,
                    special_characters, blanks,
                    ASCII_punctuation, chinese_punctuation)
    )
def is_uchar(uchar, chinese=True, ASCII=True, numbers=True,
special_characters=True, blanks=True,
ASCII_punctuation=True, chinese_punctuation=True):
'''
判断字符是否符合规则
:param uchar: a character
:return: passing the rules or not
'''
"""判断一个unicode是否是汉字"""
if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
return chinese
"""判断一个unicode是否是数字"""
if uchar >= u'\u0030' and uchar <= u'\u0039':
return numbers
"""判断一个unicode是否是英文字母"""
if (uchar >= u'\u0041' and uchar <= u'\u005a') or (uchar >= u'\u0061' and uchar <= u'\u007a'):
return ASCII
if uchar in ('-', ',', '.', '>', '<', '?'):
return ASCII_punctuation
if uchar in (',', '。', '《', '》'):
return chinese_punctuation
'''判断空格 " " '''
if uchar == " ":
return blanks
if uchar in ('\n', '\t', "\"", "\'"):
return special_characters
return False
def delete_empty_line(contents):
    """Remove one newline from each pair of adjacent newline characters.

    Newlines are paired up within a run of consecutive newlines: the first
    of each pair is kept and the second is dropped, so ``"a\\n\\nb"`` becomes
    ``"a\\nb"``. Newlines separated by any other character are never treated
    as a pair and are left untouched.

    :param contents: file contents
    :return: contents in string
    """
    kept = []
    newline_run = 0  # newlines seen since the last pair or other character
    for char in contents:
        if char == "\n":
            newline_run += 1
            if newline_run == 2:
                # Second newline of an adjacent pair: drop it and start over.
                newline_run = 0
                continue
        else:
            # Any other character breaks adjacency; without this reset every
            # second newline of the whole text would be removed.
            newline_run = 0
        kept.append(char)
    return "".join(kept)
if __name__ == "__main__":
    # Filter the lyrics file with both Chinese categories disabled, so Chinese
    # characters and Chinese punctuation are removed from the output.
    with open("paris_lyrics.txt", mode="r") as r:
        # Iterate the file line by line and join once; the result is bound to
        # a name that does not shadow the builtin ``str``.
        filtered = "".join(
            is_ustr(line, chinese=False, chinese_punctuation=False)
            for line in r
        )
    # Alternative step: remove doubled newlines from a previously saved file.
    # with open("temp.txt", mode="r") as r:
    #     filtered = delete_empty_line(r.read())
    print(filtered)