diff --git a/labs/041-embeddings-basics/.gitignore b/labs/041-embeddings-basics/.gitignore new file mode 100644 index 0000000..4a08260 --- /dev/null +++ b/labs/041-embeddings-basics/.gitignore @@ -0,0 +1 @@ +embeddings.json \ No newline at end of file diff --git a/labs/041-embeddings-basics/.vscode/launch.json b/labs/041-embeddings-basics/.vscode/launch.json new file mode 100644 index 0000000..d876024 --- /dev/null +++ b/labs/041-embeddings-basics/.vscode/launch.json @@ -0,0 +1,22 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "type": "node", + "request": "launch", + "name": "Launch Program", + "skipFiles": [ + "/**" + ], + "program": "${workspaceFolder}/src/main.ts", + "preLaunchTask": "npm: build", + "outFiles": [ + "${workspaceFolder}/dist/**/*.js" + ], + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/labs/041-embeddings-basics/.vscode/tasks.json b/labs/041-embeddings-basics/.vscode/tasks.json new file mode 100644 index 0000000..f69f865 --- /dev/null +++ b/labs/041-embeddings-basics/.vscode/tasks.json @@ -0,0 +1,16 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "npm", + "script": "build", + "group": { + "kind": "build", + "isDefault": true + }, + "problemMatcher": [], + "label": "npm: build", + "detail": "tsc" + } + ] +} \ No newline at end of file diff --git a/labs/041-embeddings-basics/package-lock.json b/labs/041-embeddings-basics/package-lock.json new file mode 100644 index 0000000..8b75b03 --- /dev/null +++ b/labs/041-embeddings-basics/package-lock.json @@ -0,0 +1,419 @@ +{ + "name": "041-embeddings-basics", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "041-embeddings-basics", + "version": "1.0.0", + "license": "ISC", + "dependencies": { + "dotenv": 
"^16.4.5", + "mathjs": "^13.0.3", + "openai": "^4.29.2" + }, + "devDependencies": { + "@types/node": "^22.1.0", + "typescript": "^5.4.3" + } + }, + "node_modules/@babel/runtime": { + "version": "7.25.0", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.25.0.tgz", + "integrity": "sha512-7dRy4DwXwtzBrPbZflqxnvfxLF8kdZXPkhymtDeFoFqE6ldzjQFgYTtYIFARcLEYDrqfBfYcZt1WqFxRoyC9Rw==", + "license": "MIT", + "dependencies": { + "regenerator-runtime": "^0.14.0" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@types/node": { + "version": "22.1.0", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.1.0.tgz", + "integrity": "sha512-AOmuRF0R2/5j1knA3c6G3HOk523Ga+l+ZXltX8SF1+5oqcXijjfTd8fY3XRZqSihEu9XhtQnKYLmkFaoxgsJHw==", + "license": "MIT", + "dependencies": { + "undici-types": "~6.13.0" + } + }, + "node_modules/@types/node-fetch": { + "version": "2.6.11", + "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.11.tgz", + "integrity": "sha512-24xFj9R5+rfQJLRyM56qh+wnVSYhyXC2tkoBndtY0U+vubqNsYXGjufB2nn8Q6gt0LrARwL6UBtMCSVCwl4B1g==", + "license": "MIT", + "dependencies": { + "@types/node": "*", + "form-data": "^4.0.0" + } + }, + "node_modules/abort-controller": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", + "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", + "license": "MIT", + "dependencies": { + "event-target-shim": "^5.0.0" + }, + "engines": { + "node": ">=6.5" + } + }, + "node_modules/agentkeepalive": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.5.0.tgz", + "integrity": "sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==", + "license": "MIT", + "dependencies": { + "humanize-ms": "^1.2.1" + }, + "engines": { + "node": ">= 8.0.0" + } + }, + "node_modules/asynckit": { + "version": 
"0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", + "license": "MIT" + }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "license": "MIT", + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/complex.js": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/complex.js/-/complex.js-2.1.1.tgz", + "integrity": "sha512-8njCHOTtFFLtegk6zQo0kkVX1rngygb/KQI6z1qZxlFI3scluC+LVTCFbrkWjBv4vvLlbQ9t88IPMC6k95VTTg==", + "license": "MIT", + "engines": { + "node": "*" + }, + "funding": { + "type": "patreon", + "url": "https://www.patreon.com/infusion" + } + }, + "node_modules/decimal.js": { + "version": "10.4.3", + "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.4.3.tgz", + "integrity": "sha512-VBBaLc1MgL5XpzgIP7ny5Z6Nx3UrRkIViUkPUdtl9aya5amy3De1gsUUSB1g3+3sExYNjCAsAznmukyxCb1GRA==", + "license": "MIT" + }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "license": "MIT", + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/dotenv": { + "version": "16.4.5", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz", + "integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, + "node_modules/escape-latex": { + "version": "1.2.0", + 
"resolved": "https://registry.npmjs.org/escape-latex/-/escape-latex-1.2.0.tgz", + "integrity": "sha512-nV5aVWW1K0wEiUIEdZ4erkGGH8mDxGyxSeqPzRNtWP7ataw+/olFObw7hujFWlVjNsaDFw5VZ5NzVSIqRgfTiw==", + "license": "MIT" + }, + "node_modules/event-target-shim": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", + "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/form-data": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", + "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", + "license": "MIT", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/form-data-encoder": { + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz", + "integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==", + "license": "MIT" + }, + "node_modules/formdata-node": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz", + "integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==", + "license": "MIT", + "dependencies": { + "node-domexception": "1.0.0", + "web-streams-polyfill": "4.0.0-beta.3" + }, + "engines": { + "node": ">= 12.20" + } + }, + "node_modules/fraction.js": { + "version": "4.3.7", + "resolved": "https://registry.npmjs.org/fraction.js/-/fraction.js-4.3.7.tgz", + "integrity": "sha512-ZsDfxO51wGAXREY55a7la9LScWpwv9RxIrYABrlvOFBlH/ShPnrtsXeuUIfXKKOVicNxQ+o8JTbJvjS4M89yew==", + "license": "MIT", + "engines": { + "node": "*" + }, + "funding": { + "type": "patreon", + 
"url": "https://github.com/sponsors/rawify" + } + }, + "node_modules/humanize-ms": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz", + "integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==", + "license": "MIT", + "dependencies": { + "ms": "^2.0.0" + } + }, + "node_modules/javascript-natural-sort": { + "version": "0.7.1", + "resolved": "https://registry.npmjs.org/javascript-natural-sort/-/javascript-natural-sort-0.7.1.tgz", + "integrity": "sha512-nO6jcEfZWQXDhOiBtG2KvKyEptz7RVbpGP4vTD2hLBdmNQSsCiicO2Ioinv6UI4y9ukqnBpy+XZ9H6uLNgJTlw==", + "license": "MIT" + }, + "node_modules/mathjs": { + "version": "13.0.3", + "resolved": "https://registry.npmjs.org/mathjs/-/mathjs-13.0.3.tgz", + "integrity": "sha512-GpP9OW6swA5POZXvgpc/1FYkAr8lKgV04QHS1tIU60klFfplVCYaNzn6qy0vSp0hAQQN7shcx9CeB507dlLujA==", + "license": "Apache-2.0", + "dependencies": { + "@babel/runtime": "^7.24.8", + "complex.js": "^2.1.1", + "decimal.js": "^10.4.3", + "escape-latex": "^1.2.0", + "fraction.js": "^4.3.7", + "javascript-natural-sort": "^0.7.1", + "seedrandom": "^3.0.5", + "tiny-emitter": "^2.1.0", + "typed-function": "^4.2.1" + }, + "bin": { + "mathjs": "bin/cli.js" + }, + "engines": { + "node": ">= 18" + } + }, + "node_modules/mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.35", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", + "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "license": "MIT", + "dependencies": { + "mime-db": "1.52.0" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/ms": { + "version": 
"2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "license": "MIT" + }, + "node_modules/node-domexception": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", + "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "github", + "url": "https://paypal.me/jimmywarting" + } + ], + "license": "MIT", + "engines": { + "node": ">=10.5.0" + } + }, + "node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "license": "MIT", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, + "node_modules/openai": { + "version": "4.55.1", + "resolved": "https://registry.npmjs.org/openai/-/openai-4.55.1.tgz", + "integrity": "sha512-FziYJcWl+SAGbt5AcRIzVzNcnKohpEMQdtzVOmHFbBp/if7x2+ACqgxF2XUbyi2PcKONPcVpmtG5h9qoDAEXwQ==", + "license": "Apache-2.0", + "dependencies": { + "@types/node": "^18.11.18", + "@types/node-fetch": "^2.6.4", + "abort-controller": "^3.0.0", + "agentkeepalive": "^4.2.1", + "form-data-encoder": "1.7.2", + "formdata-node": "^4.3.2", + "node-fetch": "^2.6.7" + }, + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } + } + }, + "node_modules/openai/node_modules/@types/node": { + "version": "18.19.43", + "resolved": 
"https://registry.npmjs.org/@types/node/-/node-18.19.43.tgz", + "integrity": "sha512-Mw/YlgXnyJdEwLoFv2dpuJaDFriX+Pc+0qOBJ57jC1H6cDxIj2xc5yUrdtArDVG0m+KV6622a4p2tenEqB3C/g==", + "license": "MIT", + "dependencies": { + "undici-types": "~5.26.4" + } + }, + "node_modules/openai/node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "license": "MIT" + }, + "node_modules/regenerator-runtime": { + "version": "0.14.1", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.14.1.tgz", + "integrity": "sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==", + "license": "MIT" + }, + "node_modules/seedrandom": { + "version": "3.0.5", + "resolved": "https://registry.npmjs.org/seedrandom/-/seedrandom-3.0.5.tgz", + "integrity": "sha512-8OwmbklUNzwezjGInmZ+2clQmExQPvomqjL7LFqOYqtmuxRgQYqOD3mHaU+MvZn5FLUeVxVfQjwLZW/n/JFuqg==", + "license": "MIT" + }, + "node_modules/tiny-emitter": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/tiny-emitter/-/tiny-emitter-2.1.0.tgz", + "integrity": "sha512-NB6Dk1A9xgQPMoGqC5CVXn123gWyte215ONT5Pp5a0yt4nlEoO1ZWeCwpncaekPHXO60i47ihFnZPiRPjRMq4Q==", + "license": "MIT" + }, + "node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", + "license": "MIT" + }, + "node_modules/typed-function": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/typed-function/-/typed-function-4.2.1.tgz", + "integrity": "sha512-EGjWssW7Tsk4DGfE+5yluuljS1OGYWiI1J6e8puZz9nTMM51Oug8CD5Zo4gWMsOhq5BI+1bF+rWTm4Vbj3ivRA==", + "license": "MIT", + "engines": { + "node": ">= 18" + } + }, + "node_modules/typescript": { + 
"version": "5.5.4", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.4.tgz", + "integrity": "sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "6.13.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.13.0.tgz", + "integrity": "sha512-xtFJHudx8S2DSoujjMd1WeWvn7KKWFRESZTMeL1RptAYERu29D6jphMjjY+vn96jvN3kVPDNxU/E13VTaXj6jg==", + "license": "MIT" + }, + "node_modules/web-streams-polyfill": { + "version": "4.0.0-beta.3", + "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz", + "integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==", + "license": "MIT", + "engines": { + "node": ">= 14" + } + }, + "node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", + "license": "BSD-2-Clause" + }, + "node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "license": "MIT", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + } + } +} diff --git a/labs/041-embeddings-basics/package.json b/labs/041-embeddings-basics/package.json new file mode 100644 index 0000000..a35a4e9 --- /dev/null +++ b/labs/041-embeddings-basics/package.json @@ -0,0 +1,22 @@ +{ + "name": "041-embeddings-basics", + "version": "1.0.0", + "main": "index.js", + "type": "module", + "scripts": { + "build": "tsc", + 
"start": "npm run build && node ./dist/main.js" + }, + "keywords": [], + "author": "", + "license": "ISC", + "devDependencies": { + "@types/node": "^22.1.0", + "typescript": "^5.4.3" + }, + "dependencies": { + "dotenv": "^16.4.5", + "mathjs": "^13.0.3", + "openai": "^4.29.2" + } +} \ No newline at end of file diff --git a/labs/041-embeddings-basics/readme.md b/labs/041-embeddings-basics/readme.md new file mode 100644 index 0000000..702ef7d --- /dev/null +++ b/labs/041-embeddings-basics/readme.md @@ -0,0 +1,55 @@ +# Introduction to Embeddings with OpenAI + +This project demonstrates how to use OpenAI's API to generate embeddings for text inputs and compare their similarities. By following this code, you will learn how to: + +* Use OpenAI's API to create embeddings for the input texts. +* Inspect the embeddings and the magnitude of the embeddings. +* Compute the dot product of embeddings manually and using the _mathjs_ library. +* Compare the similarities between different text inputs based on their embeddings. + +## Usage + +* Follow the prompts to enter three different texts. +* Observe the output, which includes: + * The first ten elements of each embedding. + * The magnitude of each embedding. + * The dot product of the first text with the second and third texts. + * A comparison of which text is more similar to the first text. + +## Example inputs + +* Although sentences 1 and 2 use different words (soccer, football), the cosine similarity of 1 and 2 is higher compared to 1 and 3 because the meaning of 1 and 2 is more similar. + * I enjoy playing soccer on weekends. + * Football is my favorite sport. Playing it on weekends with friends helps me to relax. + * In Austria, people often watch soccer on TV on weekends. + +* Here we test whether the OpenAI embedding model "understands", that the contextual meaning of "Java" is different in sentences 1 and 2. Therefore, the cosine similarity of 1 and 3 is higher as both are programming-related. 
+ * He is interested in Java programming. + * He visited Java last summer. + * He recently started learning Python programming. + +* The next example deals with negation handling. All three sentences are about whether someone likes going to the gym. Sentences 1 and 3 are positive (i.e. like training in the gym), while 2 is not. Therefore, 1 and 3 have a higher cosine similarity. + * I like going to the gym. + * I don't like going to the gym. + * I don't dislike going to the gym. + +* Let's take a look at idiomatic expressions. Sentences 1 and 2 have very similar meaning. 3 also contains "cats and dogs", but the meaning is different. As a result, cosine similarity between 1 and 2 is higher. + * It's raining cats and dogs. + * The weather is very bad, it's pouring outside. + * Cats and dogs don't go outside when it rains. + +* The next examples demonstrate that embedding models have been pre-trained with data about the real world. They understand certain domain-specific terms like "virus" and "Voron". + * The computer was infected with a virus. + * The patient's viral load is detectable. + * She is updating the antivirus software on her laptop. + + * I need to get better slicing skills to make the most of my Voron. + * 3D printing is a worth-while hobby. + * Can I have a slice of bread? + +* The last example demonstrates the limits of embeddings. Barry Harris is a well-known teacher in Jazz. Using "the 6th on the 5th" is typical for him. One must know Barry Harris and the musical theory that he has taught to understand the similarity of sentences 1 and 2. OpenAI embeddings do not understand that. + * I like how Barry Harris described Jazz theory. + * Playing the 6th on the 5th is an important concept that you must understand. + * My friends Barry and Harris often visit me to play computer games. + +Come up with your own examples and test the embeddings! 
/**
 * Ask a question on the console and read the user's answer.
 *
 * A new readline interface bound to process.stdin / process.stdout is created
 * for every call and closed again as soon as the answer callback fires.
 *
 * @param prompt - Text passed to `rl.question` as the query shown to the user.
 * @returns A promise that resolves with the answer string handed to the
 *          question callback. The promise has no rejection path: it settles
 *          only when that callback is invoked.
 */
export function readLine(prompt: string): Promise<string> {
  // The explicit <string> keeps the declared return type and the constructed
  // promise in agreement; a bare `Promise` annotation does not compile.
  return new Promise<string>((resolve) => {
    const rl = readline.createInterface({
      input: process.stdin,
      output: process.stdout
    });

    rl.question(prompt, (answer: string) => {
      // Close before resolving so the interface is already shut down when
      // the caller continues (e.g. to ask the next question).
      rl.close();
      resolve(answer);
    });
  });
}
// Name the two vectors that are compared first.
const firstVector = embeddings.data[0].embedding;
const secondVector = embeddings.data[1].embedding;

// Dot product of texts 1 and 2, computed by hand (no mathjs): multiply the
// vectors component by component and sum the products from left to right.
const manualDot = firstVector.reduce(
  (sum, component, i) => sum + component * secondVector[i],
  0
);
console.log('dot product of t1 and t2', manualDot);

// The same dot product again, this time delegated to mathjs.
const similarityToSecond = dot(firstVector, secondVector);
console.log('dot product of t1 and t2 using mathjs', similarityToSecond);

// Dot product of texts 1 and 3 - the other candidate in the comparison.
const similarityToThird = dot(firstVector, embeddings.data[2].embedding);
console.log('dot product of t1 and t3 using mathjs', similarityToThird);

// The larger dot product decides which text is reported as more similar;
// when the values are equal, the "third text" message is printed.
console.log(
  similarityToSecond > similarityToThird
    ? 'The first text is more similar to the second text.'
    : 'The first text is more similar to the third text.'
);