generated from mintlify/starter
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Started prompt dataset generation from public data, for the AI education assistant case study.
- Loading branch information
Showing
13 changed files
with
219 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
import json | ||
import openai | ||
import jsonlines | ||
from pypdf import PdfReader | ||
from pydantic import BaseModel | ||
# noinspection PyProtectedMember | ||
from openai.lib._parsing._completions import type_to_response_format_param | ||
|
||
|
||
def _request_structured_json(client, model, system_prompt, user_content, schema):
    """Run one chat completion constrained to ``schema`` and return the decoded JSON.

    :param client: OpenAI client used to issue the request.
    :param model: name of the chat model to query.
    :param system_prompt: instructions sent as the system message.
    :param user_content: text sent as the user message.
    :param schema: pydantic model class describing the expected JSON reply.
    :return: the reply content of the first choice, parsed with ``json.loads``.
    """
    # noinspection PyTypeChecker
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        response_format=type_to_response_format_param(schema),
    )
    return json.loads(completion.choices[0].message.content)


def generate_dataset_from_past_paper(
        paper_fpath: str, dataset_fpath: str,
        model: str = "gpt-4o-2024-08-06") -> None:
    """Extract question/answer pairs from a past-paper PDF into a .jsonl dataset.

    The text of every PDF page is concatenated and sent to the chat model, first
    to read off the number of the final question, then once per question to
    extract the question text and its mark-scheme answer. Questions the model
    flags as not answerable are skipped; the remaining pairs are written to
    ``dataset_fpath`` with keys ``full_question`` and ``full_answer``.

    :param paper_fpath: path of the PDF holding the paper and its mark scheme.
    :param dataset_fpath: output path, which must end in ``.jsonl``.
    :param model: chat model used for every request.
    :raises ValueError: if ``dataset_fpath`` does not end in ``.jsonl``.
    """
    # Raise explicitly rather than assert: assert statements are stripped when
    # Python runs with -O, which would silently disable this check.
    if not dataset_fpath.endswith(".jsonl"):
        raise ValueError(
            "dataset filepath must be a .jsonl file, "
            "but filepath provided is: {}".format(dataset_fpath))

    # Page markers are zero-based (enumerate's default start).
    pdf_content = "".join(
        "\nPAGE {}\n".format(page_idx) + page.extract_text()
        for page_idx, page in enumerate(PdfReader(paper_fpath).pages))
    client = openai.OpenAI()

    class NumberOfQuestions(BaseModel):
        value: int

    count_prompt = (
        "You will be given a maths exam paper and a detailed mark scheme, which have "
        "been extracted from a PDF. Your task is to determine the total number of "
        "questions in the paper, and return this number. The questions are "
        "explicitly numbered in the paper, and so counting should not be needed, "
        "simply read the number corresponding to the final question, and return the "
        "value as a number (not a word), and nothing else."
    )
    num_questions = _request_structured_json(
        client, model, count_prompt, pdf_content, NumberOfQuestions)["value"]

    print("paper contains {} questions".format(num_questions))

    class QuestionAnswerPair(BaseModel):
        full_question: str
        full_answer: str
        answerable: bool

    # Built once outside the loop; only the question number changes per request.
    extract_prompt_template = (
        "You will be given a maths exam paper and a detailed mark scheme, which "
        "have been extracted from a PDF. Your task is to extract question {n} in a "
        "totally unmodified manner and also the corresponding answer from the mark "
        "scheme for question {n} only, also in a fully unmodified and exhaustive manner, "
        "as per the json format requested. The question will appear in the earlier pages of the "
        "document, and the answer will appear in the later pages of the "
        "document in the mark scheme section, with the question numbers aligned to each answer in the mark scheme. "
        "Some questions might not be answerable in a purely text-based manner. "
        "For example the question might require drawing on a chart or graph, "
        "or the question might refer to some shape, image, table or chart. "
        "Otherwise, the question text might simply seem incomplete in some "
        "way, where the question could not be answered based on the question text alone. "
        "In such cases, you should set the answerable response as "
        "False, otherwise if it CAN be answered based on the purely text-based question, "
        "then answerable should be set as True."
    )

    qna_pairs = []
    for question_num in range(1, num_questions + 1):
        pair = _request_structured_json(
            client, model, extract_prompt_template.format(n=question_num),
            pdf_content, QuestionAnswerPair)
        print("\nQUESTION {}".format(question_num))
        # The flag only decides whether to keep the pair; it is not stored.
        if not pair.pop("answerable"):
            print("\nSKIPPING:\nQ: {}".format(pair["full_question"]))
            print("--------")
            continue
        print("\nADDING:\nQ: {}\nA: {}".format(
            pair["full_question"], pair["full_answer"]))
        print("--------")
        qna_pairs.append(pair)

    with jsonlines.open(dataset_fpath, mode='w') as writer:
        writer.write_all(qna_pairs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# global | ||
import os | ||
|
||
# local | ||
from case_studies.education_assistant.dataset_generation import \ | ||
generate_dataset_from_past_paper | ||
|
||
# Resolve resources relative to this file so the script works from any
# working directory.
this_dir = os.path.dirname(os.path.realpath(__file__))

# Filename template; the placeholder takes the paper suffix ("a", "b", "c").
maths_paper_fname = \
    "resources/maths/169001-higher-tier-sample-assessment-materials_{}.pdf"

maths_paper_fpaths = [os.path.join(this_dir, maths_paper_fname.format(char))
                      for char in ("a", "b", "c")]
# Swap only the extension: str.replace(".pdf", ...) would also rewrite any
# ".pdf" substring occurring earlier in the absolute path (e.g. a directory).
maths_dataset_fpaths = [os.path.splitext(fpath)[0] + ".jsonl"
                        for fpath in maths_paper_fpaths]
# Each paper is written to a .jsonl file next to its source PDF.
for maths_paper_fpath, maths_dataset_fpath in zip(maths_paper_fpaths, maths_dataset_fpaths):
    generate_dataset_from_past_paper(maths_paper_fpath, maths_dataset_fpath)
15 changes: 15 additions & 0 deletions
15
...ducation_assistant/resources/maths/169001-higher-tier-sample-assessment-materials_a.jsonl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
{"full_question": "1 18 rice cakes weigh a total of 130 g.\n There are 329 calories in 100 g of rice cakes.\n How many calories are there in one rice cake?\n..................... calories [3]", "full_answer": "23.6 – 23.8 Accept 24 provided full method shown 3 1 AO1.3b 2 AO3.1c M2 for 329×13018×100 Or M1 for any two of 329100 or 130100 or 32918 or 329 130 May be done in stages"} | ||
{"full_question": "2 A circular table top has radius 70 cm. (a) Calculate the area of the table top in cm2, giving your answer as a multiple of r. (a) ....................... cm2 [2] (b) The volume of the table top is 17 150r cm3. Calculate the thickness of the table top. (b) ........................ cm [2]", "full_answer": "2 (a) 4900π 2 1 AO 1.2 1 AO1.3a M1 for π 702 may be implied by 15393.8… (b) 3.5 2 2 AO1.3a M1 for ππ‘4907001 15’their FT from (a), provided (a) is a multiple of π"} | ||
{"full_question": "3 The value of a car £ V is given by\nV = 20 000 × 0.9t\n where t is the age of the car in complete years.\n (a) Write down the value of V when t = 0.\n(a) £ ........................... [1]\n (b) What is the value of V when t = 3?\n(b) £ ........................... [2]\n (c) After how many complete years will the car’s value drop below £10 000?\n(c) .............................. [2]", "full_answer": "3 (a) £20 000 1 1 AO1.3a \n(b) £14 580 or £14 600 2 2 AO1.3a M1 for 20 000 × 0.93 \n(c) 7 years 2 1 AO1.3a 1 AO3.1c M1 for 2 trials shown"} | ||
{"full_question": "Kieran, Jermaine and Chris play football.\\n • Kieran has scored 8 more goals than Chris.\\n • Jermaine has scored 5 more goals than Kieran.\\n • Altogether they have scored 72 goals.\\n How many goals did they each score?\\nKieran ......................\\nJermaine ......................\\nChris ......................\\n[5]", "full_answer": "25, 30, 17\\n\\nM1 for any two consistent expressions, e.g. x – 8, x \\nM1 for x − 8 + x + x + 5 = 72 oe \\nA1 for x = 25 \\nB1 for Kieran 25 or Jermaine 30 or Chris 17 \\nAccept equivalent correct equations"} | ||
{"full_question": "6 Peter makes a large amount of pink paint by mixing red and white paint in the ratio 2 : 3. Red paint costs £80 per 10 litres. White paint costs £5 per 10 litres. Peter sells his pink paint in 10-litre tins for £60 per tin. Calculate how much profit he makes for each tin he sells. £ ..................................... [5]", "full_answer": "£25 \n\n5 \n2 AO1.3b \n3 AO3.1d \nM1 for 10 × 25 = 4 litres red or 10 × 35 = 6 litres white \nM1 for red costs £8 per litre or white costs £0.50 per litre \nM1 for cost of one 10-litre can is their ‘4’ × their ‘8’ + their ‘6’ × their ‘0.5’ \nM1 for 60 – their ‘35’ \nAlternative method: \nM1 for 2 : 3 = 20 litres red : 30 litres white \nM1 for 2 × £80 + 3 × £5 = £175 \nM1 for '175'5their = 35 \nM1 for 60 – their ‘35’"} | ||
{"full_question": "7 Dan believes he knows what his brother Ethan is thinking. He carries out an experiment to test this. Dan and Ethan sit back-to-back. Ethan rolls an ordinary fair dice. Ethan then thinks about the number on the dice while Dan tries to predict this number . (a) In 300 attempts, how many correct predictions would you expect Dan to make if he was just guessing? (a) .............................. [2] (b) The results of the first 15 attempts are shown in the table. Ethan’s number 2 6 5 3 2 1 5 1 3 4 4 6 1 6 5 Dan’s prediction 2 4 3 1 2 6 1 6 1 6 4 3 2 6 5 Matching pair ✓ ✓ ✓ Estimate the probability of getting a matching pair using the results of (i) the first five attempts, (b)(i) ........................... [1] (ii) all 15 attempts. (ii) .......................... [1] (c) Use answers from (a) and (b) to comment on Dan’s belief that he knows what Ethan is thinking. [2]", "full_answer": "(a) 50 2 2 AO1.3a B1 for 16 (b) (i) 25 oe 1 1 AO2.1b (ii) 15 oe 1 1 AO2.1b (c) No evidence that Dan knows what Ethan is thinking as over the 15 trials the relative frequency of 15 is very close to the theoretical probability of 16 2 1 AO2.5a 1 AO3.3 M1 for reason not including reference to 15 relative frequency or 16 theoretical probability FT their (a) and (b)"} | ||
{"full_question": "9 (a) Anna estimates the height of a tree. Anna holds a ruler vertically so the height of the tree is exactly covered by the ruler. She is 20 metres from the tree. The ruler is 30 cm long. The horizontal distance from her eyes to the ruler is 60 cm. Calculate an estimate of the height of the tree. (a) .......................... m [3] (b) Give two reasons why this method may not be suitable to estimate the height of a very tall building. 1 2 [2]", "full_answer": "9 (a) 10 metres 3 1 AO1.3a 2 AO3.1c M1 for correct ratio height 30=20 60 oe M1 rearrange Or M1 for scale factor 0.5 M1 for 20 × 0.5 (b) 2 valid reasons, e.g. She would have to be very far from the building. The estimate is likely to be inaccurate due to the scale factors at the distances involved. 2 2 AO3.4a"} | ||
{"full_question": "10 ABCD is a parallelogram. Prove that triangle ABD is congruent to triangle CDB. [3]", "full_answer": "e.g. BD is common ABD = BDC (alternate angles) AB = CD (parallelogram) So triangles ABD and CBD are congruent by SAS"} | ||
{"full_question": "11 (a) Give one reason why 0 is an even number.\n [1]\n (b) The lengths of the sides of a right-angled triangle are all integers. \n Prove that if the lengths of the two shortest sides are even, then the length of the third side must also be even.\n[3]", "full_answer": "11 (a) Any correct reason 1 1 AO2.4a Exemplar responses: -1 and 1 both odd and either side of 0 Or can be divided by 2 exactly Or numbers that end in 0 are even Or zero remainder when divided by 2 Or next number in pattern of even numbers 8 6 4 2 Or added to an even number it gives even answer and added to odd number gives odd answer (b) e.g. a2 + b2 = c2 a = 2x and b = 2y implies c2 = 4x2 + 4y2 So c is even 3 1 AO2.1a 1 AO2.4b 1 AO3.2 B1 for use of Pythagoras’ theorem M1 for even × even = even soi"} | ||
{"full_question": "13 The volume of Earth is 1.08 × 1012 km3. The volume of Jupiter is 1.43 × 1015 km3. How many times larger is the radius of Jupiter than the radius of Earth? Assume that Jupiter and Earth are both spheres. [The volume v of a sphere with radius r is v = πr^3.] [4]", "full_answer": "11 or better 4 2 AO1.3b 1 AO3.1b 1 AO3.2 M1 for r = \\sqrt[3]{\\frac{v}{\\pi}} soi A1 for r (Earth) = 6365 km or r (Jupiter) = 69890 km M1 for \\frac{their '69890'}{their '6365'} Alternative method: M1 for \\frac{\\sqrt[3]{1.43 \\times 10^{15}}}{\\sqrt[3]{1.08 \\times 10^{12}}} A1 for 13.24[.074...] M1 for \\sqrt[3]{13.24}"} | ||
{"full_question": "15 At a constant temperature, the volume of a gas V is inversely proportional to its pressure p. By what percentage will the pressure of a gas change if its volume increases by 25%? \n............................... % [4]", "full_answer": "15 20 [decrease](%) 4\n1 AO1.1\n1 AO1.3b\n2 AO3.1d M1 for pV = constant oe M1 for pinitialVinitial = pafterVafter oe M1 for 1 × 1 = pafter × 1.25 oe"} | ||
{"full_question": "A, B, C and D are points on the circumference of a circle, centre O. AC is a diameter of the circle. Angle ABD = 58°. Angle CDB = 22°. Work out the sizes of angle ACD and ACB, giving reasons for your answers. (a) Angle ACD = ...........................° [2] (b) Angle ACB = ...........................° [3]", "full_answer": "(a) 58° Subtended on same arc oe 2 1 AO2.1a 1 AO2.4b B1 for angle (b) 68° e.g. angle DBC is 32° because the angle in a semicircle is a right angle oe so angle ACB is 68° because angles in a triangle sum to 180° oe 3 2 AO2.1a 1 AO2.4b B1 for using the angle in a semicircle is a right angle B1 for using angles in a triangle sum to 180°"} | ||
{"full_question": "A restaurant menu has 8 starters, 12 mains and 6 desserts. A customer can choose from the following meals • a starter and a main, • a main and a dessert, • a starter, a main and a dessert. Show that there are 744 different ways of choosing a meal at this restaurant. [3]", "full_answer": "Starter and main 8 × 12 Main and dessert 12 × 6 Three courses 8 × 12 × 6 96 + 72 + 576 = 744"} | ||
{"full_question": "19 A sequence is defined by the term-to-term rule un+1 = un2 - 8un + 17.\n (a) Given that u1 = 4, find u2 and u3.\n(a) ....................... ....................... [2]\n (b) Given instead that u1 = 2, find u2, u3 and u100.\n(b) ....................... ....................... ....................... [3]", "full_answer": "19 (a) 1 nfww 10 nfww 2\n1 AO1.2 1 AO1.3a B1 for each FT their ‘u2’ for u3\n(b) 5 nfww 2 nfww 5 nfww 3\n1 AO1.2 1 AO1.3a 1 AO2.1a B1 for each FT their ‘u2’ for u3"} | ||
{"full_question": "20 (a) Express as a single fraction. Simplify your answer.\n(a) .......................................... [2]\n(b) Using your answer to part (a), prove that if m and n are positive integers and m < n, then [2]", "full_answer": "20 (a) –(n + 1)/mn\n(b) m < n n – m > 0 n(n+1)–m(n+1) > 0 (n+1)(n–m) > 0 \n\nM1 for their ‘–(n+1)/mn’ > 0"} |
Binary file added
BIN
+797 KB
.../education_assistant/resources/maths/169001-higher-tier-sample-assessment-materials_a.pdf
Binary file not shown.
Oops, something went wrong.