nlp.py
# Copyright 2016 Google Inc. All Rights Reserved.
import sys

import googleapiclient.discovery
def analyze_syntax(text):
    """Uses the Natural Language API to analyze the given text string and
    returns the response from the API. Requests an encodingType that matches
    the encoding used natively by Python. Raises an errors.HttpError if
    there is a connection problem.
    """
    service = googleapiclient.discovery.build('language', 'v1beta1')

    body = {
        'document': {
            'type': 'PLAIN_TEXT',
            'content': text,
        },
        'features': {
            'extract_syntax': True,
        },
        'encodingType': get_native_encoding_type(),
    }

    request = service.documents().annotateText(body=body)
    return request.execute()
def get_native_encoding_type():
    """Returns the encoding type that matches Python's native strings."""
    if sys.maxunicode == 65535:
        return 'UTF16'
    else:
        return 'UTF32'
def find_verb_noun(tokens):
    """Collects verbs (excluding auxiliaries and subjects) and nouns from
    the API's token list, returning both lists plus a space-joined string
    of all collected words.
    """
    # Lemmas of auxiliary verbs to skip.
    auxiliary_lemmas = ['be', 'do']
    verb_list = []
    noun_list = []
    verb_noun_string = ""
    for token in tokens:
        tag = token['partOfSpeech']['tag']
        if (tag == 'VERB'
                and token['dependencyEdge']['label'] != 'NSUBJ'
                and token['lemma'] not in auxiliary_lemmas):
            verb_list.append(token['text']['content'])
            verb_noun_string += " " + token['text']['content']
        if tag == 'NOUN':
            noun_list.append(token['text']['content'])
            verb_noun_string += " " + token['text']['content']
    return verb_list, noun_list, verb_noun_string.strip()
def main(text):
    analysis = analyze_syntax(text)
    tokens = analysis.get('tokens', [])
    verb_list, noun_list, verb_noun_string = find_verb_noun(tokens)
    return verb_noun_string, verb_list, noun_list
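# The sketch below shows one way this module might be driven from the
# command line. It is not part of the original file: the argument name
# and description are assumptions, added only to illustrate how main()
# is meant to be called.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Extract verbs and nouns from text with the NL API.')
    parser.add_argument('text', help='The text to analyze.')
    args = parser.parse_args()

    verb_noun_string, verb_list, noun_list = main(args.text)
    print('Verbs: {}'.format(verb_list))
    print('Nouns: {}'.format(noun_list))
    print('Combined: {}'.format(verb_noun_string))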