-
Notifications
You must be signed in to change notification settings - Fork 2
/
pdf-ai-info-processing.py
112 lines (92 loc) · 3.62 KB
/
pdf-ai-info-processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from PyPDF2 import PdfReader
import openai
import csv
import sys
import os
import re
import json
import subprocess
def openai_query(prompt):
    """Send ``prompt`` to the OpenAI completion endpoint and return its text.

    The request uses the ``text-davinci-002`` model with at most 1500
    completion tokens and a temperature of 0.3.  Only the text of the first
    returned choice is handed back to the caller.
    """
    request_options = {
        "model": "text-davinci-002",
        "prompt": prompt,
        "max_tokens": 1500,
        "temperature": 0.3,
    }
    completion = openai.Completion.create(**request_options)
    first_choice = completion['choices'][0]
    return first_choice['text']
def query(prompt_suffix):
    """Ask the model to turn ``prompt_suffix`` into question/answer CSV rows.

    A fixed instruction block is placed in front of ``prompt_suffix``, the
    combined prompt is sent through ``openai_query``, and the raw reply is
    echoed to stdout before being returned unchanged.
    """
    instructions = "Can you give me an answer with the structure of csv with the core ideas of this text in answer/question form? Only the CSV, do not add any other information as explanation or conclusion. Also, put it in plain text. Do not add a confirmation as 'sure' neither say 'in this example'. Also, at the beginning, do not add 'Question,Answer'. Also when you want to add a comma to the question or the answer which is part of the string substitute it by '|'"
    full_prompt = instructions + prompt_suffix
    reply = openai_query(full_prompt)
    print(reply)
    return reply
def _escape_row_commas(row):
    """Escape every comma in one ``question?,answer`` row except the separator."""
    question, mark, remainder = row.partition("?")
    if not mark:
        # Without a question mark there is no way to tell which comma is the
        # column separator, so the row is passed through untouched.
        return row
    # The first comma after the question mark separates question and answer;
    # anything between the "?" and that comma (``lead``) is kept verbatim.
    lead, separator, answer = remainder.partition(",")
    return (
        question.replace(",", "|")
        + mark
        + lead
        + separator
        + answer.replace(",", "|")
    )


def replace_commas_after_question(s):
    """Make each ``question?,answer`` row of ``s`` a two-column CSV row.

    Every line of ``s`` is handled independently.  In a line containing a
    ``?``, the first comma after the first ``?`` is kept as the column
    separator and every other comma on that line (in the question or in the
    answer) is replaced by ``|``.  Lines without a ``?`` are returned
    unchanged.

    Working per line means a row's result no longer depends on the rows
    before it: commas in the very first question are escaped too, and a
    ``?`` appearing inside an answer does not protect a second comma.

    Args:
        s: Newline-separated text, typically the raw model output.

    Returns:
        The text with the same number of lines, commas escaped as described.
    """
    return "\n".join(_escape_row_commas(row) for row in s.split("\n"))
def generate_response_text(num_pages):
    """Run the first ``num_pages`` pages of the global ``reader`` through the model.

    Page texts are accumulated into a batch until adding the next page would
    reach the prompt budget; the batch is then sent through ``query`` and a
    new batch is started with that page.  After the last page the remaining
    batch is sent as well, so the tail of the document (or the whole of a
    short document) is not lost.  The concatenated replies are finally passed
    through ``replace_commas_after_question``.

    Args:
        num_pages: Number of pages to read, starting at index 0 of
            ``reader.pages``.  ``reader`` is a module-level name set by the
            caller before this function runs.

    Returns:
        The post-processed model output for all batches, as one string.
    """
    # 448 equals the character length of the fixed instruction prefix that
    # query() puts in front of every batch; 4096 is the overall character
    # budget the original code compares against.
    prefix_chars = 448
    budget_chars = 4096

    replies = []
    prompt_suffix = ""
    for i in range(num_pages):
        # Guard against an extractor handing back None for a page.
        text = reader.pages[i].extract_text() or ""
        print("here")
        if len(prompt_suffix + text) + prefix_chars < budget_chars:
            prompt_suffix += f"\n{text}"
            continue
        print(f"{i}/{num_pages}")
        # Skip the request when nothing has been accumulated yet (e.g. the
        # very first page alone exceeds the budget).
        if prompt_suffix.strip():
            replies.append(query(prompt_suffix))
        prompt_suffix = f"\n{text}"

    # Flush whatever is left after the final page.
    if prompt_suffix.strip():
        replies.append(query(prompt_suffix))

    response_text_end = ""
    for reply in replies:
        # Keep the last row of one reply from fusing with the first row of
        # the next when the model did not end its output with a newline.
        if response_text_end and not response_text_end.endswith("\n"):
            response_text_end += "\n"
        response_text_end += reply
    return replace_commas_after_question(response_text_end)
# --- Script body: load settings, select the PDFs, convert each one to CSV ---

# OpenAI API key, read from key.json in the working directory.
with open("key.json", encoding="utf-8") as f:
    data = json.load(f)
openai.api_key = data["openai_api_key"]

# Tag used as the prefix of every generated output file name.
with open("anki_tag.json", encoding="utf-8") as f:
    data = json.load(f)
tag = data["tag"]

pdf_files_path = './pdf_files'

if '-a' in sys.argv:
    pdf_files = [f for f in os.listdir(pdf_files_path) if f.endswith('.pdf')]
elif '-f' in sys.argv:
    # Everything after -f is taken as a file name; non-PDF arguments are dropped.
    filenames = sys.argv[sys.argv.index('-f') + 1:]
    pdf_files = [f for f in filenames if f.endswith('.pdf')]
else:
    print("Error: Please specify either -a to process all PDF files in the directory, or -f followed by a list of filenames to process.")
    sys.exit(1)

# Make sure the output locations exist before the first write.
os.makedirs('out_csv', exist_ok=True)
os.makedirs('failed', exist_ok=True)

# Iterate the selection made above (not the raw directory listing), so that
# -f is honoured and non-PDF directory entries are never opened.
for filename in pdf_files:
    pdf_path = os.path.join(pdf_files_path, filename)
    # `reader` has to remain a module-level name: generate_response_text()
    # reads it as a global.
    reader = PdfReader(pdf_path)
    num_pages = len(reader.pages)
    final_answer = generate_response_text(num_pages)

    # A usable row has at least one comma; everything else is set aside.
    lines = final_answer.split('\n')
    final_answer = '\n'.join(line for line in lines if ',' in line)
    failed_lines = '\n'.join(line for line in lines if ',' not in line)

    print(final_answer)
    print("//////////")
    print(failed_lines)
    print("//////////")

    # basename() keeps the output inside out_csv/ and failed/ even when a
    # name given after -f contains directory components.
    custom_filename = f"{tag}_" + os.path.splitext(os.path.basename(filename))[0]
    csv_path = os.path.join('out_csv', custom_filename)
    csv_name = f"{csv_path}.csv"
    fail_path = os.path.join('failed', custom_filename)
    fail_name = f"{fail_path}.csv"
    print(csv_name)
    with open(csv_name, 'w', encoding="utf-8") as file:
        file.write(final_answer)
    with open(fail_name + ".failed", 'w', encoding="utf-8") as file:
        file.write(failed_lines)

# NOTE(review): the original indentation was lost, so it is unclear whether
# the clean-up script ran per file or once at the end; it is run once here,
# after all files have been written -- confirm against clean_csv.sh.
subprocess.call("./clean_csv.sh")