# Notebook 02 - extract authors.
# Reads the catalogue export, keeps only records with a known author, reduces
# the "gnd" column to its first identifier and writes the result to authors.csv.
import pandas as pd

# %% Load the raw export
df = pd.read_csv("Studentenbewegung.csv", encoding="utf-8")
df

# %% Drop the columns that are not needed for the author analysis
df = df.drop(['titel', 'rela'], axis=1)
df

# %% Keep only records with a known author.
# .copy() makes the filtered result an independent frame, so adding a column in
# the next cell does not write into a slice of the original frame
# (SettingWithCopyWarning).
df = df[df['author'] != 'unknown'].copy()
df

# %% Clean-up: a "gnd" cell may hold several identifiers separated by ";" -
# keep only the first one.
df['gnd_modified'] = df['gnd'].str.split(';').str.get(0)
df

# %% The original multi-valued column is no longer needed
df = df.drop(['gnd'], axis=1)
df

# %% How many of the remaining records have no usable GND identifier?
count_unknown = (df['gnd_modified'] == 'unknown').sum()
print("Einträge in gnd_modified mit value 'unknown': ", count_unknown)

# %% Persist the cleaned author table
df.to_csv("authors.csv", encoding="utf-8", index=False)
def get_lat_long(place):
    """Look up the coordinates of a place name.

    Uses the module-level ``geolocator`` to geocode ``place``.

    Parameters
    ----------
    place : str
        Free-text place name. Anything that is not a non-blank string
        (``None``, NaN from pandas, ``""``) is treated as "no place".

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the match, or ``(None, None)`` when the
        input is missing, nothing was found, or the request timed out.
    """
    # Empty cells reach this function as NaN (a float) when it is applied to a
    # DataFrame column. Passing them on would spend a request on a value that
    # is not a place name at all, so short-circuit here.
    if not isinstance(place, str) or not place.strip():
        return None, None

    try:
        location = geolocator.geocode(place)
    except GeocoderTimedOut:
        # Best effort: a timeout for one place must not abort the whole column.
        return None, None

    if not location:
        return None, None
    return location.latitude, location.longitude
# ### Build the map visualisation

# %% Create the map.
# df_with_counts holds one row per record, so a place that occurs N times would
# be drawn as N identical markers on top of each other. Every row of a place
# carries the same count, so one row per place is enough.
map_points = df_with_counts.drop_duplicates(subset="place")

fig = px.scatter_mapbox(
    map_points,
    lat="lat",
    lon="long",
    hover_name="place",
    hover_data=["count"],
    size="count",
    color="count",
    zoom=4,
    height=600,
)

# Layout: OpenStreetMap tiles, no outer margin
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})

# Show the map
fig.show()

# %% Optionally save the map as a stand-alone HTML file
fig.write_html("places_map_plotly.html")
# %% Load the records that carry links
df = pd.read_csv("Studentenbewegung_pdfs.csv", encoding="utf-8")
df

# %% The "links" column holds one or more URLs per record, separated by ";"
links = df["links"]
print(links)

# %% Look at the first entry (positional, so it works for any index)
print(links.iloc[0])

# %% Collect every d-nb.info link.
# dropna() skips records without any link (NaN has no .split), and splitting
# on ";" followed by strip() accepts both "a; b" and "a;b".
linklist = [
    link.strip()
    for element in links.dropna()
    for link in element.split(";")
    if link.strip().startswith("https://d-nb.info/")
]

# %%
print(len(linklist))

# ## Downloading the PDF files
# %% Shortened list so the download finishes quickly.
# NOTE: the slice is empty if fewer than 61 links were collected.
testlist = linklist[60:70]
print(testlist)
import requests


def download_text(testlist, save_directory):
    """Download every link in ``testlist`` into ``save_directory``.

    For each link a HEAD request is sent first to read the server-supplied
    file name from the ``Content-Disposition`` header; the file is then stored
    as ``<idn>_<original file name>``. A link that fails is reported and
    skipped, so one bad link does not stop the remaining downloads.

    Parameters
    ----------
    testlist : iterable of str
        Links whose second-to-last path segment is the IDN
        (``.../<idn>/<suffix>``).
    save_directory : str
        Target directory; created if it does not exist yet.
    """
    # Create the target directory if it does not exist yet
    os.makedirs(save_directory, exist_ok=True)

    for link in testlist:
        # The IDN is the second-to-last path segment of the link
        idn = link.split('/')[-2]

        # Read the original file name from the Content-Disposition header.
        # requests.head() does not follow redirects unless asked to, and a
        # redirect response would not carry the header of the final file.
        try:
            response = requests.head(link, allow_redirects=True, timeout=30)
        except requests.RequestException as e:
            print(f"Fehler beim Herunterladen von {link}: {e}")
            continue

        content_disposition = response.headers.get('Content-Disposition')
        if content_disposition and 'filename=' in content_disposition:
            orig_filename = content_disposition.split('filename=')[-1].strip('";')
            # The header comes from the server: keep only the final path
            # component so it cannot point outside save_directory.
            orig_filename = os.path.basename(orig_filename)
            print(orig_filename)
            file_name = f"{idn}_{orig_filename}"
        else:
            # No usable header - fall back to the IDN alone.
            # NOTE(review): assumes the link serves a PDF - confirm.
            file_name = f"{idn}.pdf"

        file_path = os.path.join(save_directory, file_name)
        if os.path.exists(file_path):
            # Re-running the notebook should not fetch the same file again
            print(f" Bereits vorhanden, überspringe: {file_name}")
            continue

        print(f" Starte Download: {file_name}")
        try:
            # Download the file
            wget.download(link, out=file_path)
        except Exception as e:
            print(f"Fehler beim Herunterladen von {link}: {e}")
"execution_count": null, + "id": "d808a4a6-ae0d-4569-96be-fc1d86544f93", + "metadata": {}, + "outputs": [], + "source": [ + "def test_read_pdf(file_path, file_name):\n", + " reader = PdfReader(file_path + \"/\" + file_name)\n", + " number_of_pages = len(reader.pages)\n", + " page = reader.pages[10]\n", + " text = page.extract_text()\n", + " print(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a10b0292-be99-4bcf-aa23-2e0b786d9842", + "metadata": {}, + "outputs": [], + "source": [ + "test_read_pdf(\"downloads\", \"1334711879.pdf\")" + ] + }, + { + "cell_type": "markdown", + "id": "b96d9085-3900-4794-a2fc-09bd4d25429a", + "metadata": {}, + "source": [ + "### Einlesen einer kompletten PDF-Datei:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c18ddf2f-c7e7-444e-9a47-9253a5cba71f", + "metadata": {}, + "outputs": [], + "source": [ + "def read_pdf_content(file_path, filename):\n", + " try:\n", + " reader = PdfReader(file_path + \"/\" + filename)\n", + " number_of_pages = len(reader.pages)\n", + " text = \"\"\n", + " for page in reader.pages:\n", + " text += page.extract_text() # + \"\\n\"\n", + " return text\n", + " except Exception as e:\n", + " print(f\"Fehler beim Einlesen der Datei {filename}: {e}\")\n", + " return \"Fehler beim Einlesen der Datei\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75610ab9-e0c7-43c4-947a-9598a00bcb9b", + "metadata": {}, + "outputs": [], + "source": [ + "test = read_pdf_content(\"downloads\", \"1192129652_40257_1.pdf\")\n", + "print(test)" + ] + }, + { + "cell_type": "markdown", + "id": "37d3998e-a306-4cc6-9b5a-de2b69d31634", + "metadata": {}, + "source": [ + "### Einlesen **aller** Dateien aus dem \"downloads\"-Verzeichnis, die auf .pdf enden. 
def get_text_from_pdfs(file_path):
    """Read every PDF in a directory and map file name to extracted text.

    Parameters
    ----------
    file_path : str
        Directory to scan (not recursive).

    Returns
    -------
    dict
        ``{filename: text}`` for every entry whose name ends in ``.pdf``
        (case-insensitive), in alphabetical order of the file names. The text
        is whatever ``read_pdf_content`` returns for that file.
    """
    pdf_text = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting dict (and any DataFrame built from it) reproducible.
    for filename in sorted(os.listdir(file_path)):
        if filename.lower().endswith(".pdf"):
            pdf_text[filename] = read_pdf_content(file_path, filename)
    return pdf_text
null, + "id": "7c755735-c172-4911-a8ae-73d7e24dc4c0", + "metadata": {}, + "outputs": [], + "source": [ + "df_texts.to_hdf(\"pdf_texts.h5\", key=\"df_texts\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9da4a877-fd9f-4ed2-b100-37c5022bd34d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}