-
Notifications
You must be signed in to change notification settings - Fork 2
/
getResult.py
88 lines (68 loc) · 2.45 KB
/
getResult.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
import json
import time
import boto3
# Module-level boto3 clients, created once at import time and shared by the
# functions in this file: the S3 client is used to write result objects, the
# Textract client to fetch document-analysis results.
s3_client = boto3.client('s3')
textract_client = boto3.client('textract')
def handler(event, context):
    """Check a Textract analysis job and, if it succeeded, store its output in S3.

    The handler(event, context) signature suggests an AWS Lambda entry point
    (presumably driven by a polling workflow -- confirm against the caller).

    Reads from ``event``:
        job_id       -- Textract job to query.
        bucket_name  -- bucket the results are written to (success path only).
        object_name  -- source object; its basename names the output files
                        (success path only).

    Writes to ``event`` (which is mutated in place and returned):
        job_status, job_update_timestamp -- always.
        results        -- the first response page, only when the job is
                          neither SUCCEEDED nor IN_PROGRESS.
        output_bucket  -- bucket holding the output (success path).
        raw_results    -- keys of the per-page raw JSON dumps (success path).
        blocks         -- key of the merged ``Blocks`` JSON file (success path).

    ``context`` is unused.
    """
    textract_job = event['job_id']
    print(textract_job)

    page = getJobResults(textract_job)
    status = page['JobStatus']
    event['job_status'] = status
    event['job_update_timestamp'] = time.time()

    # Still running: nothing to attach yet, report the status only.
    if status == "IN_PROGRESS":
        return event

    # Any other non-success status: hand the response back so the failure
    # can be inspected by whoever consumes the event.
    if status != "SUCCEEDED":
        event['results'] = page
        return event

    # Success: every output object lives under 'output/' in the input bucket,
    # named after the basename of the source object.
    bucket = event['bucket_name']
    key_stem = os.path.join('output/', os.path.basename(event['object_name']))
    event['output_bucket'] = bucket

    def save_json(key, payload):
        # Single place for the upload settings shared by all output files.
        s3_client.put_object(
            Bucket=bucket,
            Key=key,
            Body=json.dumps(payload),
            ServerSideEncryption='AES256',
            ContentType='application/json',
        )

    raw_keys = []
    event['raw_results'] = raw_keys
    merged = []
    page_no = 1

    # Walk the paginated response: dump each page verbatim, collect its
    # blocks, and follow NextToken until the service stops returning one.
    while True:
        print(len(merged))
        raw_key = f"{key_stem}.raw.{page_no:02d}.json"
        save_json(raw_key, page)
        raw_keys.append(raw_key)
        print(f"Result {page_no:02} saved to: s3://{bucket}/{raw_key}")

        merged.extend(page.get('Blocks', []))

        if 'NextToken' not in page:
            break

        token = page['NextToken']
        print(f"NextToken: {token}")
        page = getJobResults(textract_job, next_token=token)
        page_no += 1

    print(len(merged))

    # One consolidated file with the blocks of all pages, for convenience.
    blocks_key = f"{key_stem}.blocks.json"
    save_json(blocks_key, {'Blocks': merged})
    print(f"Blocks file saved to: s3://{bucket}/{blocks_key}")
    event['blocks'] = blocks_key

    return event
def getJobResults(job_id, next_token=None):
    """Fetch one page of document-analysis results for a Textract job.

    Args:
        job_id: Identifier of the Textract job, passed as ``JobId``.
        next_token: Pagination token from a previous page; when falsy, no
            ``NextToken`` argument is sent.

    Returns:
        The response of ``textract_client.get_document_analysis`` unchanged.
    """
    # Only forward NextToken when there actually is one.
    extra_args = {'NextToken': next_token} if next_token else {}
    print("kwargs")
    print(extra_args)
    return textract_client.get_document_analysis(JobId=job_id, **extra_args)