diff --git a/Stack_Overflow2018- correlations and analysis.ipynb b/Stack_Overflow2018- correlations and analysis.ipynb
new file mode 100644
index 0000000..6632aa1
--- /dev/null
+++ b/Stack_Overflow2018- correlations and analysis.ipynb
@@ -0,0 +1,6984 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# EDA ON STACK OVERFLOW 2018 DATA"
+ ],
+ "metadata": {
+ "id": "I0-wmKWr9L2v"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**DATA LOADING**"
+ ],
+ "metadata": {
+ "id": "g4EeBybM95Zo"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "*Importing modules*"
+ ],
+ "metadata": {
+ "id": "q4uFubUDAPQN"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns"
+ ],
+ "metadata": {
+ "id": "PofFmURLASrI"
+ },
+ "execution_count": 310,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "*Loading 2018 stackoverflow data*"
+ ],
+ "metadata": {
+ "id": "6nuRfrESAoQC"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df=pd.read_csv(\"/content/survey_results_sample_2018.csv\")"
+ ],
+ "metadata": {
+ "id": "YreYMLwKAW3p"
+ },
+ "execution_count": 311,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "*Initial exploration*"
+ ],
+ "metadata": {
+ "id": "Rm63VMBVA2rm"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#First few rows of the data\n",
+ "df.head(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "NeeuOsj4AzIs",
+ "outputId": "8d9da8dd-3b84-4e6b-e02f-aa4d5432ea39"
+ },
+ "execution_count": 312,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Respondent Hobby OpenSource Country Student \\\n",
+ "0 1 Yes No Kenya No \n",
+ "1 3 Yes Yes United Kingdom No \n",
+ "2 4 Yes Yes United States No \n",
+ "3 5 No No United States No \n",
+ "4 7 Yes No South Africa Yes, part-time \n",
+ "5 8 Yes No United Kingdom No \n",
+ "6 9 Yes Yes United States No \n",
+ "7 10 Yes Yes Nigeria No \n",
+ "8 11 Yes Yes United States No \n",
+ "9 16 No Yes India No \n",
+ "\n",
+ " Employment FormalEducation \\\n",
+ "0 Employed part-time Bachelor’s degree (BA, BS, B.Eng., etc.) \n",
+ "1 Employed full-time Bachelor’s degree (BA, BS, B.Eng., etc.) \n",
+ "2 Employed full-time Associate degree \n",
+ "3 Employed full-time Bachelor’s degree (BA, BS, B.Eng., etc.) \n",
+ "4 Employed full-time Some college/university study without earning ... \n",
+ "5 Employed full-time Bachelor’s degree (BA, BS, B.Eng., etc.) \n",
+ "6 Employed full-time Some college/university study without earning ... \n",
+ "7 Employed full-time Bachelor’s degree (BA, BS, B.Eng., etc.) \n",
+ "8 Employed full-time Some college/university study without earning ... \n",
+ "9 Employed full-time Bachelor’s degree (BA, BS, B.Eng., etc.) \n",
+ "\n",
+ " UndergradMajor \\\n",
+ "0 Mathematics or statistics \n",
+ "1 A natural science (ex. biology, chemistry, phy... \n",
+ "2 Computer science, computer engineering, or sof... \n",
+ "3 Computer science, computer engineering, or sof... \n",
+ "4 Computer science, computer engineering, or sof... \n",
+ "5 Computer science, computer engineering, or sof... \n",
+ "6 Computer science, computer engineering, or sof... \n",
+ "7 Computer science, computer engineering, or sof... \n",
+ "8 Fine arts or performing arts (ex. graphic desi... \n",
+ "9 Computer science, computer engineering, or sof... \n",
+ "\n",
+ " CompanySize \\\n",
+ "0 20 to 99 employees \n",
+ "1 10,000 or more employees \n",
+ "2 20 to 99 employees \n",
+ "3 100 to 499 employees \n",
+ "4 10,000 or more employees \n",
+ "5 10 to 19 employees \n",
+ "6 10,000 or more employees \n",
+ "7 10 to 19 employees \n",
+ "8 100 to 499 employees \n",
+ "9 500 to 999 employees \n",
+ "\n",
+ " DevType ... \\\n",
+ "0 Full-stack developer ... \n",
+ "1 Database administrator;DevOps specialist;Full-... ... \n",
+ "2 Engineering manager;Full-stack developer ... \n",
+ "3 Full-stack developer ... \n",
+ "4 Data or business analyst;Desktop or enterprise... ... \n",
+ "5 Back-end developer;Database administrator;Fron... ... \n",
+ "6 Back-end developer;Front-end developer;Full-st... ... \n",
+ "7 Designer;Front-end developer;QA or test developer ... \n",
+ "8 Back-end developer;C-suite executive (CEO, CTO... ... \n",
+ "9 Designer ... \n",
+ "\n",
+ " Exercise Gender SexualOrientation \\\n",
+ "0 3 - 4 times per week Male Straight or heterosexual \n",
+ "1 Daily or almost every day Male Straight or heterosexual \n",
+ "2 NaN NaN NaN \n",
+ "3 I don't typically exercise Male Straight or heterosexual \n",
+ "4 3 - 4 times per week Male Straight or heterosexual \n",
+ "5 1 - 2 times per week Male Straight or heterosexual \n",
+ "6 I don't typically exercise Male Straight or heterosexual \n",
+ "7 1 - 2 times per week Female NaN \n",
+ "8 I don't typically exercise Male Straight or heterosexual \n",
+ "9 NaN NaN NaN \n",
+ "\n",
+ " EducationParents \\\n",
+ "0 Bachelor’s degree (BA, BS, B.Eng., etc.) \n",
+ "1 Bachelor’s degree (BA, BS, B.Eng., etc.) \n",
+ "2 NaN \n",
+ "3 Some college/university study without earning ... \n",
+ "4 Some college/university study without earning ... \n",
+ "5 Secondary school (e.g. American high school, G... \n",
+ "6 Master’s degree (MA, MS, M.Eng., MBA, etc.) \n",
+ "7 Primary/elementary school \n",
+ "8 Some college/university study without earning ... \n",
+ "9 NaN \n",
+ "\n",
+ " RaceEthnicity Age Dependents MilitaryUS \\\n",
+ "0 Black or of African descent 25 - 34 years old Yes NaN \n",
+ "1 White or of European descent 35 - 44 years old Yes NaN \n",
+ "2 NaN NaN NaN NaN \n",
+ "3 White or of European descent 35 - 44 years old No No \n",
+ "4 White or of European descent 18 - 24 years old Yes NaN \n",
+ "5 White or of European descent 18 - 24 years old No NaN \n",
+ "6 White or of European descent 18 - 24 years old No No \n",
+ "7 Black or of African descent 25 - 34 years old No NaN \n",
+ "8 White or of European descent 35 - 44 years old Yes No \n",
+ "9 NaN NaN NaN NaN \n",
+ "\n",
+ " SurveyTooLong SurveyEasy \n",
+ "0 The survey was an appropriate length Very easy \n",
+ "1 The survey was an appropriate length Somewhat easy \n",
+ "2 NaN NaN \n",
+ "3 The survey was an appropriate length Somewhat easy \n",
+ "4 The survey was an appropriate length Somewhat easy \n",
+ "5 The survey was an appropriate length Somewhat easy \n",
+ "6 The survey was an appropriate length Somewhat easy \n",
+ "7 The survey was too long Somewhat difficult \n",
+ "8 The survey was an appropriate length Very easy \n",
+ "9 NaN NaN \n",
+ "\n",
+ "[10 rows x 129 columns]"
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Respondent | \n",
+ " Hobby | \n",
+ " OpenSource | \n",
+ " Country | \n",
+ " Student | \n",
+ " Employment | \n",
+ " FormalEducation | \n",
+ " UndergradMajor | \n",
+ " CompanySize | \n",
+ " DevType | \n",
+ " ... | \n",
+ " Exercise | \n",
+ " Gender | \n",
+ " SexualOrientation | \n",
+ " EducationParents | \n",
+ " RaceEthnicity | \n",
+ " Age | \n",
+ " Dependents | \n",
+ " MilitaryUS | \n",
+ " SurveyTooLong | \n",
+ " SurveyEasy | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Yes | \n",
+ " No | \n",
+ " Kenya | \n",
+ " No | \n",
+ " Employed part-time | \n",
+ " Bachelor’s degree (BA, BS, B.Eng., etc.) | \n",
+ " Mathematics or statistics | \n",
+ " 20 to 99 employees | \n",
+ " Full-stack developer | \n",
+ " ... | \n",
+ " 3 - 4 times per week | \n",
+ " Male | \n",
+ " Straight or heterosexual | \n",
+ " Bachelor’s degree (BA, BS, B.Eng., etc.) | \n",
+ " Black or of African descent | \n",
+ " 25 - 34 years old | \n",
+ " Yes | \n",
+ " NaN | \n",
+ " The survey was an appropriate length | \n",
+ " Very easy | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3 | \n",
+ " Yes | \n",
+ " Yes | \n",
+ " United Kingdom | \n",
+ " No | \n",
+ " Employed full-time | \n",
+ " Bachelor’s degree (BA, BS, B.Eng., etc.) | \n",
+ " A natural science (ex. biology, chemistry, phy... | \n",
+ " 10,000 or more employees | \n",
+ " Database administrator;DevOps specialist;Full-... | \n",
+ " ... | \n",
+ " Daily or almost every day | \n",
+ " Male | \n",
+ " Straight or heterosexual | \n",
+ " Bachelor’s degree (BA, BS, B.Eng., etc.) | \n",
+ " White or of European descent | \n",
+ " 35 - 44 years old | \n",
+ " Yes | \n",
+ " NaN | \n",
+ " The survey was an appropriate length | \n",
+ " Somewhat easy | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4 | \n",
+ " Yes | \n",
+ " Yes | \n",
+ " United States | \n",
+ " No | \n",
+ " Employed full-time | \n",
+ " Associate degree | \n",
+ " Computer science, computer engineering, or sof... | \n",
+ " 20 to 99 employees | \n",
+ " Engineering manager;Full-stack developer | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 5 | \n",
+ " No | \n",
+ " No | \n",
+ " United States | \n",
+ " No | \n",
+ " Employed full-time | \n",
+ " Bachelor’s degree (BA, BS, B.Eng., etc.) | \n",
+ " Computer science, computer engineering, or sof... | \n",
+ " 100 to 499 employees | \n",
+ " Full-stack developer | \n",
+ " ... | \n",
+ " I don't typically exercise | \n",
+ " Male | \n",
+ " Straight or heterosexual | \n",
+ " Some college/university study without earning ... | \n",
+ " White or of European descent | \n",
+ " 35 - 44 years old | \n",
+ " No | \n",
+ " No | \n",
+ " The survey was an appropriate length | \n",
+ " Somewhat easy | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 7 | \n",
+ " Yes | \n",
+ " No | \n",
+ " South Africa | \n",
+ " Yes, part-time | \n",
+ " Employed full-time | \n",
+ " Some college/university study without earning ... | \n",
+ " Computer science, computer engineering, or sof... | \n",
+ " 10,000 or more employees | \n",
+ " Data or business analyst;Desktop or enterprise... | \n",
+ " ... | \n",
+ " 3 - 4 times per week | \n",
+ " Male | \n",
+ " Straight or heterosexual | \n",
+ " Some college/university study without earning ... | \n",
+ " White or of European descent | \n",
+ " 18 - 24 years old | \n",
+ " Yes | \n",
+ " NaN | \n",
+ " The survey was an appropriate length | \n",
+ " Somewhat easy | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 8 | \n",
+ " Yes | \n",
+ " No | \n",
+ " United Kingdom | \n",
+ " No | \n",
+ " Employed full-time | \n",
+ " Bachelor’s degree (BA, BS, B.Eng., etc.) | \n",
+ " Computer science, computer engineering, or sof... | \n",
+ " 10 to 19 employees | \n",
+ " Back-end developer;Database administrator;Fron... | \n",
+ " ... | \n",
+ " 1 - 2 times per week | \n",
+ " Male | \n",
+ " Straight or heterosexual | \n",
+ " Secondary school (e.g. American high school, G... | \n",
+ " White or of European descent | \n",
+ " 18 - 24 years old | \n",
+ " No | \n",
+ " NaN | \n",
+ " The survey was an appropriate length | \n",
+ " Somewhat easy | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 9 | \n",
+ " Yes | \n",
+ " Yes | \n",
+ " United States | \n",
+ " No | \n",
+ " Employed full-time | \n",
+ " Some college/university study without earning ... | \n",
+ " Computer science, computer engineering, or sof... | \n",
+ " 10,000 or more employees | \n",
+ " Back-end developer;Front-end developer;Full-st... | \n",
+ " ... | \n",
+ " I don't typically exercise | \n",
+ " Male | \n",
+ " Straight or heterosexual | \n",
+ " Master’s degree (MA, MS, M.Eng., MBA, etc.) | \n",
+ " White or of European descent | \n",
+ " 18 - 24 years old | \n",
+ " No | \n",
+ " No | \n",
+ " The survey was an appropriate length | \n",
+ " Somewhat easy | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 10 | \n",
+ " Yes | \n",
+ " Yes | \n",
+ " Nigeria | \n",
+ " No | \n",
+ " Employed full-time | \n",
+ " Bachelor’s degree (BA, BS, B.Eng., etc.) | \n",
+ " Computer science, computer engineering, or sof... | \n",
+ " 10 to 19 employees | \n",
+ " Designer;Front-end developer;QA or test developer | \n",
+ " ... | \n",
+ " 1 - 2 times per week | \n",
+ " Female | \n",
+ " NaN | \n",
+ " Primary/elementary school | \n",
+ " Black or of African descent | \n",
+ " 25 - 34 years old | \n",
+ " No | \n",
+ " NaN | \n",
+ " The survey was too long | \n",
+ " Somewhat difficult | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 11 | \n",
+ " Yes | \n",
+ " Yes | \n",
+ " United States | \n",
+ " No | \n",
+ " Employed full-time | \n",
+ " Some college/university study without earning ... | \n",
+ " Fine arts or performing arts (ex. graphic desi... | \n",
+ " 100 to 499 employees | \n",
+ " Back-end developer;C-suite executive (CEO, CTO... | \n",
+ " ... | \n",
+ " I don't typically exercise | \n",
+ " Male | \n",
+ " Straight or heterosexual | \n",
+ " Some college/university study without earning ... | \n",
+ " White or of European descent | \n",
+ " 35 - 44 years old | \n",
+ " Yes | \n",
+ " No | \n",
+ " The survey was an appropriate length | \n",
+ " Very easy | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 16 | \n",
+ " No | \n",
+ " Yes | \n",
+ " India | \n",
+ " No | \n",
+ " Employed full-time | \n",
+ " Bachelor’s degree (BA, BS, B.Eng., etc.) | \n",
+ " Computer science, computer engineering, or sof... | \n",
+ " 500 to 999 employees | \n",
+ " Designer | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
10 rows × 129 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "df"
+ }
+ },
+ "metadata": {},
+ "execution_count": 312
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#dimensions of the data\n",
+ "df.shape"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Pk727hn8BAtt",
+ "outputId": "03c5ea43-34a5-46ab-93ed-b475668820bc"
+ },
+ "execution_count": 313,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(99, 129)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 313
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#datatypes of the data\n",
+ "df.dtypes"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "MJa9xMhaBN0h",
+ "outputId": "7632a777-6524-4f89-8909-5245090ee4b8"
+ },
+ "execution_count": 314,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Respondent int64\n",
+ "Hobby object\n",
+ "OpenSource object\n",
+ "Country object\n",
+ "Student object\n",
+ " ... \n",
+ "Age object\n",
+ "Dependents object\n",
+ "MilitaryUS object\n",
+ "SurveyTooLong object\n",
+ "SurveyEasy object\n",
+ "Length: 129, dtype: object"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 314
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.columns"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "tYgb3H0fBXSM",
+ "outputId": "22538fe5-35bd-40a6-c808-8fd6d49ae00e"
+ },
+ "execution_count": 315,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Index(['Respondent', 'Hobby', 'OpenSource', 'Country', 'Student', 'Employment',\n",
+ " 'FormalEducation', 'UndergradMajor', 'CompanySize', 'DevType',\n",
+ " ...\n",
+ " 'Exercise', 'Gender', 'SexualOrientation', 'EducationParents',\n",
+ " 'RaceEthnicity', 'Age', 'Dependents', 'MilitaryUS', 'SurveyTooLong',\n",
+ " 'SurveyEasy'],\n",
+ " dtype='object', length=129)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 315
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**DATA CLEANING**"
+ ],
+ "metadata": {
+ "id": "JcsoF8mUC3FW"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#checking for the missing or null values\n",
+ "missing_values=df.isnull().sum()\n",
+ "missing_values"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Oq3xNHCbC0dw",
+ "outputId": "12906a48-860c-4cbe-adf4-1cabc51117c9"
+ },
+ "execution_count": 316,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Respondent 0\n",
+ "Hobby 0\n",
+ "OpenSource 0\n",
+ "Country 0\n",
+ "Student 1\n",
+ " ..\n",
+ "Age 32\n",
+ "Dependents 32\n",
+ "MilitaryUS 80\n",
+ "SurveyTooLong 32\n",
+ "SurveyEasy 32\n",
+ "Length: 129, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 316
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "From this we can conclude that the dataset contains missing values."
+ ],
+ "metadata": {
+ "id": "2XZHexqvDe_l"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Percentage of missing cells in the whole frame; df.size (rows * columns) replaces np.product, which was removed in NumPy 2.0\n",
+ "total_cells = df.size\n",
+ "total_missing = missing_values.sum()\n",
+ "percent = (total_missing / total_cells) * 100"
+ ],
+ "metadata": {
+ "id": "GUvMjmK8Dbhf"
+ },
+ "execution_count": 317,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print('Total number of cells: ', total_cells)\n",
+ "print('Total number of missing values: ', total_missing)\n",
+ "print('Missing Percentage: ', percent, '%')"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "UzfvQT1kDxYW",
+ "outputId": "dcb0434c-2bf1-4793-dd52-252c79a37ea7"
+ },
+ "execution_count": 318,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Total number of cells: 12771\n",
+ "Total number of missing values: 4421\n",
+ "Missing Percentage: 34.61749275702764 %\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Approximately 35% of the data is missing, so we need to handle the missing values."
+ ],
+ "metadata": {
+ "id": "wpCPQqvTEiPZ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**cleaning the gender column**"
+ ],
+ "metadata": {
+ "id": "JocVQX4iFe7X"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Calculate the missing percentage of the 'Gender' column (count the nulls once and reuse the result)\n",
+ "gender_missing_count = df['Gender'].isnull().sum()\n",
+ "missing_percentage_gender = (gender_missing_count / len(df['Gender'])) * 100\n",
+ "\n",
+ "print(\"Missing percentage of the 'Gender' column:\", missing_percentage_gender, \"%\")\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "IsT298IwEf-8",
+ "outputId": "d4f6009b-6445-461c-dc19-d60749c532b4"
+ },
+ "execution_count": 319,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Missing percentage of the 'Gender' column: 32.323232323232325 %\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "About 32% of the gender data is missing. Let's look at the categories that are present."
+ ],
+ "metadata": {
+ "id": "j9z63fRMF_BT"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Gender'].unique()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "qOL_vo90F812",
+ "outputId": "3574d5f6-b193-41f8-8f5f-8e8c00afcd4b"
+ },
+ "execution_count": 320,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array(['Male', nan, 'Female',\n",
+ " 'Female;Male;Transgender;Non-binary, genderqueer, or gender non-conforming',\n",
+ " 'Female;Male',\n",
+ " 'Male;Non-binary, genderqueer, or gender non-conforming'],\n",
+ " dtype=object)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 320
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#counting the number of each gender\n",
+ "df.groupby('Gender')['Gender'].count()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "WThNvO9dGNJr",
+ "outputId": "67b1184e-d003-4cc8-c2da-9e54ab211126"
+ },
+ "execution_count": 321,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Gender\n",
+ "Female 5\n",
+ "Female;Male 1\n",
+ "Female;Male;Transgender;Non-binary, genderqueer, or gender non-conforming 1\n",
+ "Male 59\n",
+ "Male;Non-binary, genderqueer, or gender non-conforming 1\n",
+ "Name: Gender, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 321
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Fill the missing values and reduce the number of categories to male, female, and non-conforming only, to simplify the analysis."
+ ],
+ "metadata": {
+ "id": "N5mSGdaFHQ7C"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Collapse the multi-select Gender answers into Female / Male / Non-conforming.\n",
+ "NON_BINARY = 'Non-binary, genderqueer, or gender non-conforming'\n",
+ "female_answers = ['Female;Male', 'Female;Male;Transgender', 'Female;Transgender', 'Female;' + NON_BINARY,\n",
+ "                  'Female;Male;' + NON_BINARY, 'Female;Transgender;' + NON_BINARY, 'Female;Male;Transgender;' + NON_BINARY]\n",
+ "# 'Male;Transgender' is grouped with the other 'Male;...' combinations (it was previously sent to 'Female').\n",
+ "male_answers = ['Transgender', 'Male;Transgender', 'Male;' + NON_BINARY, 'Male;Transgender;' + NON_BINARY]\n",
+ "non_conforming_answers = [NON_BINARY, 'Transgender;' + NON_BINARY]\n",
+ "gender_map = {answer: 'Female' for answer in female_answers}\n",
+ "gender_map.update({answer: 'Male' for answer in male_answers})\n",
+ "gender_map.update({answer: 'Non-conforming' for answer in non_conforming_answers})\n",
+ "# NOTE(review): missing answers are filled with the non-binary label and therefore also end up as 'Non-conforming';\n",
+ "# consider a separate 'Not reported' bucket so non-respondents are not counted as non-conforming.\n",
+ "# The result is assigned back because chained df[col].replace(..., inplace=True) does not update df under Copy-on-Write.\n",
+ "df['Gender'] = df['Gender'].fillna(NON_BINARY).replace(gender_map)"
+ ],
+ "metadata": {
+ "id": "mr6IrwZxGWvH"
+ },
+ "execution_count": 322,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.groupby('Gender')['Gender'].count()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "OyI5oGQJHPVt",
+ "outputId": "eb5c7510-581a-4ed5-aeae-92fe1d1d9e03"
+ },
+ "execution_count": 323,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Gender\n",
+ "Female 7\n",
+ "Male 60\n",
+ "Non-conforming 32\n",
+ "Name: Gender, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 323
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.shape"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "1wIFGzW0HcDS",
+ "outputId": "e6b9170c-4b38-4e2d-c810-ee55fa6065fe"
+ },
+ "execution_count": 324,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(99, 129)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 324
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.isnull().sum()['Gender']"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "cEYOPjlNHw-I",
+ "outputId": "13653b63-e216-4716-f57b-dd606ae29425"
+ },
+ "execution_count": 325,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 325
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Hence, there are no remaining null values in the gender column.\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n"
+ ],
+ "metadata": {
+ "id": "6h37X59cH0ne"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**cleaning the country column**"
+ ],
+ "metadata": {
+ "id": "Y7lg2qNyIHRT"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.groupby('Country')['Country'].count()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "WUTBusH2HzGg",
+ "outputId": "7bb437a9-23ae-4b51-86b7-35dbd6102bc8"
+ },
+ "execution_count": 326,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Country\n",
+ "Algeria 1\n",
+ "Argentina 2\n",
+ "Australia 1\n",
+ "Belgium 1\n",
+ "Brazil 1\n",
+ "Bulgaria 1\n",
+ "Chile 1\n",
+ "China 2\n",
+ "Colombia 1\n",
+ "Croatia 1\n",
+ "Denmark 1\n",
+ "Dominican Republic 1\n",
+ "Finland 1\n",
+ "France 3\n",
+ "Germany 5\n",
+ "Greece 1\n",
+ "India 16\n",
+ "Indonesia 2\n",
+ "Ireland 1\n",
+ "Israel 1\n",
+ "Japan 1\n",
+ "Kenya 1\n",
+ "Latvia 1\n",
+ "Netherlands 1\n",
+ "Nigeria 1\n",
+ "Poland 3\n",
+ "Romania 1\n",
+ "Russian Federation 4\n",
+ "South Africa 1\n",
+ "Spain 2\n",
+ "Sweden 3\n",
+ "Ukraine 1\n",
+ "United Kingdom 7\n",
+ "United States 28\n",
+ "Name: Country, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 326
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Country'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ds9Hf2hgIn8d",
+ "outputId": "18299bd1-8228-486f-8c17-0854e534f060"
+ },
+ "execution_count": 327,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 327
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Forward-fill any missing Country (the check above shows none, so this is a no-op here); assigned back because fillna(method=..., inplace=True) on a column is deprecated\n",
+ "df['Country'] = df['Country'].ffill()"
+ ],
+ "metadata": {
+ "id": "8XoIM4LoIreR"
+ },
+ "execution_count": 328,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Country'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "VvZcOHoAIv79",
+ "outputId": "f003101c-cdce-4fd1-a900-9f4321524f75"
+ },
+ "execution_count": 329,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 329
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Hence, there are no null values in the country column."
+ ],
+ "metadata": {
+ "id": "oMNRkkg8JKSP"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**cleaning the hobbies column**"
+ ],
+ "metadata": {
+ "id": "J34nhG7BJXUD"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Hobby'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "3LXVD5rzJEuX",
+ "outputId": "b7595cf2-ccf7-4c56-d64d-15ef2dcb3277"
+ },
+ "execution_count": 330,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 330
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.groupby('Hobby')['Hobby'].count()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "D-Z7I4JrJhRo",
+ "outputId": "00260786-ac06-41b9-8589-7206507c0d93"
+ },
+ "execution_count": 331,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Hobby\n",
+ "No 20\n",
+ "Yes 79\n",
+ "Name: Hobby, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 331
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Since the Hobby column is already limited to Yes or No, we don't need to categorize it further."
+ ],
+ "metadata": {
+ "id": "DIRvLcbQJ-qO"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Hobby'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "k3_uu36PJ8Wm",
+ "outputId": "48319fc6-6cb7-49da-cf3f-2097c914531e"
+ },
+ "execution_count": 332,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 332
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Hence, there are no null values in the hobbies column."
+ ],
+ "metadata": {
+ "id": "rqM9IL9AKKbS"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**cleaning the undergradmajor column**"
+ ],
+ "metadata": {
+ "id": "lNfYL_EYKU10"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['UndergradMajor'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "P2xoCi7mKIgO",
+ "outputId": "c65fafb1-5933-4112-e2a1-bb5cea7a74fb"
+ },
+ "execution_count": 333,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "11"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 333
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['UndergradMajor'].value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "X_jhcUTzKdKE",
+ "outputId": "93b3d7ab-aee7-4690-f261-fd0a4ce3b097"
+ },
+ "execution_count": 334,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "UndergradMajor\n",
+ "Computer science, computer engineering, or software engineering 56\n",
+ "A natural science (ex. biology, chemistry, physics) 7\n",
+ "Another engineering discipline (ex. civil, electrical, mechanical) 7\n",
+ "A business discipline (ex. accounting, finance, marketing) 5\n",
+ "Fine arts or performing arts (ex. graphic design, music, studio art) 4\n",
+ "Information systems, information technology, or system administration 3\n",
+ "Mathematics or statistics 2\n",
+ "Web development or web design 2\n",
+ "A social science (ex. anthropology, psychology, political science) 1\n",
+ "A humanities discipline (ex. literature, history, philosophy) 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 334
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def refactor_major(df):\n",
+ "    \"\"\"Collapse the long UndergradMajor answers into short labels; overwrites the column in place and returns df.\n",
+ "    Missing or unrecognised answers become the literal string 'nan' (the value the previous np.select call produced),\n",
+ "    built with pandas map/fillna because the np.NaN alias that call relied on was removed in NumPy 2.0.\"\"\"\n",
+ "    major_labels = {\n",
+ "        'Computer science, computer engineering, or software engineering': 'Computer Science',\n",
+ "        'Another engineering discipline (ex. civil, electrical, mechanical)': 'Engineering',\n",
+ "        'Information systems, information technology, or system administration': 'Info Systems',\n",
+ "        'Mathematics or statistics': 'Math/Stat',\n",
+ "        'A natural science (ex. biology, chemistry, physics)': 'Other Science', 'A health science (ex. nursing, pharmacy, radiology)': 'Other Science',\n",
+ "        'Web development or web design': 'Web Design/Dev',\n",
+ "        'A business discipline (ex. accounting, finance, marketing)': 'Business',\n",
+ "        'A humanities discipline (ex. literature, history, philosophy)': 'Arts and Science', 'A social science (ex. anthropology, psychology, political science)': 'Arts and Science',\n",
+ "        'Fine arts or performing arts (ex. graphic design, music, studio art)': 'Arts and Science',\n",
+ "        'I never declared a major': 'No major',\n",
+ "    }\n",
+ "    df['UndergradMajor'] = df['UndergradMajor'].map(major_labels).fillna('nan')\n",
+ "    return df"
+ ],
+ "metadata": {
+ "id": "u3dNbovyKfnF"
+ },
+ "execution_count": 335,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "The function above maps each value in the \"UndergradMajor\" column to a shorter category label."
+ ],
+ "metadata": {
+ "id": "ilV2dwqCMcsM"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df = refactor_major(df)"
+ ],
+ "metadata": {
+ "id": "CRTeAPqlKuzm"
+ },
+ "execution_count": 336,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['UndergradMajor'] = df['UndergradMajor'].replace('nan', 'No major')  # assigned back: chained inplace replace does not update df under Copy-on-Write"
+ ],
+ "metadata": {
+ "id": "nHySDU3cM66h"
+ },
+ "execution_count": 337,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['UndergradMajor'].value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "FDnUZGWMNJDA",
+ "outputId": "c4b541bb-d131-42e4-c403-41a3cc429609"
+ },
+ "execution_count": 338,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "UndergradMajor\n",
+ "Computer Science 56\n",
+ "No major 11\n",
+ "Other Science 7\n",
+ "Engineering 7\n",
+ "Arts and Science 6\n",
+ "Business 5\n",
+ "Info Systems 3\n",
+ "Math/Stat 2\n",
+ "Web Design/Dev 2\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 338
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['UndergradMajor'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "7thqzYMnNPDu",
+ "outputId": "b5f4a0ee-bd6c-4718-e009-7274b1781291"
+ },
+ "execution_count": 339,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 339
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Keep only the rows that have an UndergradMajor value.\n",
+ "df = df.dropna(subset=['UndergradMajor'])"
+ ],
+ "metadata": {
+ "id": "3VIx65RZNlNY"
+ },
+ "execution_count": 340,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['UndergradMajor'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "8DYmuQj-Nnoi",
+ "outputId": "db61126d-5275-4c87-bb1d-561aad75ccd3"
+ },
+ "execution_count": 341,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 341
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Because the remaining missing values were assigned to the 'No major' category,\n",
+ "the UndergradMajor column no longer contains any null values."
+ ],
+ "metadata": {
+ "id": "ku9qgKCENWC6"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Cleaning the JobSearchStatus column**"
+ ],
+ "metadata": {
+ "id": "WL4YcRevN0PL"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['JobSearchStatus'].value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "xzXzkzVsNUPH",
+ "outputId": "ea2cb69d-5a5f-4f94-ce6f-065a779ebc23"
+ },
+ "execution_count": 342,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "JobSearchStatus\n",
+ "I’m not actively looking, but I am open to new opportunities 54\n",
+ "I am not interested in new job opportunities 18\n",
+ "I am actively looking for a job 11\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 342
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Discard respondents who did not answer the JobSearchStatus question.\n",
+ "df = df.dropna(subset=['JobSearchStatus'])"
+ ],
+ "metadata": {
+ "id": "vszoQOfKNymE"
+ },
+ "execution_count": 343,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Grouping the answers into two categories: respondents who are actively seeking a job ('Seeking') and respondents who are not ('Not seeking'); any other answer receives the default value."
+ ],
+ "metadata": {
+ "id": "1ey3HKcNlSTt"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def refactor_job(df):\n",
+ " '''function to change JobStatus category to Seeking and Non Seeking'''\n",
+ "\n",
+ " conditions_job = [(df['JobSearchStatus'] == 'I am actively looking for a job'),\n",
+ " (df['JobSearchStatus'] == 'I am not interested in new job opportunities')\n",
+ " | (df['JobSearchStatus'] == 'I’m not actively looking, but I am open to new opportunities')]\n",
+ "\n",
+ " choices_job = ['Seeking', 'Not seeking']\n",
+ "\n",
+ " df['JobSearchStatus'] = np.select(conditions_job, choices_job, default=np.nan)\n",
+ "\n",
+ " return df\n"
+ ],
+ "metadata": {
+ "id": "1FiRCkUMlGYX"
+ },
+ "execution_count": 344,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df = refactor_job(df)"
+ ],
+ "metadata": {
+ "id": "w6eyQW0KlfFA"
+ },
+ "execution_count": 345,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['JobSearchStatus'].value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "4tjcirO5lhYz",
+ "outputId": "e3ce271a-c3c2-457b-de49-4d154e593a23"
+ },
+ "execution_count": 346,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "JobSearchStatus\n",
+ "nan 54\n",
+ "Not seeking 18\n",
+ "Seeking 11\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 346
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['JobSearchStatus'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "rZhqXlToljTS",
+ "outputId": "d372befe-50bd-41d4-bc7e-2e577ec8e95b"
+ },
+ "execution_count": 347,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 347
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
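+ "Hence, `isnull()` reports no null values in the JobSearchStatus column (the `nan` entries listed by `value_counts()` above are the text label 'nan', not true missing values)."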
+ ],
+ "metadata": {
+ "id": "jEXRkQselt_c"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Cleaning the Employment column**"
+ ],
+ "metadata": {
+ "id": "dbccv75Pl7dH"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Employment'].value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "HN5Hw7Jslsn2",
+ "outputId": "ef82359b-dfbf-446b-dc34-961671ca2f8e"
+ },
+ "execution_count": 348,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Employment\n",
+ "Employed full-time 77\n",
+ "Employed part-time 6\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 348
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Employment'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "9dB_IwQPmHfa",
+ "outputId": "3b662665-8036-4bc0-a248-844f8992309f"
+ },
+ "execution_count": 349,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 349
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Forward-fill: a missing Employment value takes the previous row's answer.\n",
+ "# Series.ffill() replaces the deprecated fillna(method='ffill'), and assigning the\n",
+ "# result back avoids the chained inplace call that may leave df unchanged.\n",
+ "df['Employment'] = df['Employment'].ffill()"
+ ],
+ "metadata": {
+ "id": "e8soU9WymLEm"
+ },
+ "execution_count": 350,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Employment']"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Tc7hG1v4mULw",
+ "outputId": "31a3b1de-2e0b-415f-b632-35b613529343"
+ },
+ "execution_count": 351,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0 Employed part-time\n",
+ "1 Employed full-time\n",
+ "2 Employed full-time\n",
+ "3 Employed full-time\n",
+ "4 Employed full-time\n",
+ " ... \n",
+ "93 Employed full-time\n",
+ "94 Employed full-time\n",
+ "95 Employed full-time\n",
+ "97 Employed full-time\n",
+ "98 Employed full-time\n",
+ "Name: Employment, Length: 83, dtype: object"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 351
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Employment'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "m4dwu9zemeg2",
+ "outputId": "53f90b6b-df50-4eca-8150-7da1c55cbf3c"
+ },
+ "execution_count": 352,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 352
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Hence, there are no existing null values in the employment column"
+ ],
+ "metadata": {
+ "id": "6sCzyYBnmuNR"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Job Satisfaction**"
+ ],
+ "metadata": {
+ "id": "gGEvRbeAmzSY"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['JobSatisfaction'].value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ZBwz9J5OmpJX",
+ "outputId": "1f85a202-801a-4250-97bb-a1271472fdd1"
+ },
+ "execution_count": 353,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "JobSatisfaction\n",
+ "Moderately satisfied 26\n",
+ "Slightly satisfied 17\n",
+ "Neither satisfied nor dissatisfied 11\n",
+ "Extremely satisfied 9\n",
+ "Slightly dissatisfied 9\n",
+ "Moderately dissatisfied 6\n",
+ "Extremely dissatisfied 2\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 353
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['JobSatisfaction'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "GDnUpweIm66o",
+ "outputId": "ae74bb27-c72a-4582-ab7c-91f1b400648e"
+ },
+ "execution_count": 354,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "3"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 354
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Fill the null values by forward-fill: each gap takes the previous row's answer.\n",
+ "# Series.ffill() replaces the deprecated fillna(method='ffill'), and assigning the\n",
+ "# result back avoids the chained inplace call that may leave df unchanged.\n",
+ "df['JobSatisfaction'] = df['JobSatisfaction'].ffill()"
+ ],
+ "metadata": {
+ "id": "eMrI20RNm_Am"
+ },
+ "execution_count": 355,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['JobSatisfaction'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "_0Qg8298nCCC",
+ "outputId": "b2e1a61b-11ea-4d14-fe43-18072df757f2"
+ },
+ "execution_count": 356,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 356
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Hence, there are no existing null values in the job satisfaction column"
+ ],
+ "metadata": {
+ "id": "agn4hD9KnKLJ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Race Ethnicity**"
+ ],
+ "metadata": {
+ "id": "D_K5_TEFnUQX"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.groupby('RaceEthnicity')['RaceEthnicity'].count()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "y8BUpHtAnTrh",
+ "outputId": "078d0df5-3ab7-4cd0-bfd5-da23ac8f9b41"
+ },
+ "execution_count": 357,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "RaceEthnicity\n",
+ "Black or of African descent 3\n",
+ "Black or of African descent;East Asian;Hispanic or Latino/Latina;Middle Eastern;Native American, Pacific Islander, or Indigenous Australian;South Asian;White or of European descent 1\n",
+ "Black or of African descent;Hispanic or Latino/Latina 1\n",
+ "East Asian 2\n",
+ "Hispanic or Latino/Latina 1\n",
+ "Hispanic or Latino/Latina;White or of European descent 1\n",
+ "South Asian 8\n",
+ "White or of European descent 41\n",
+ "Name: RaceEthnicity, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 357
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Consolidating the RaceEthnicity values with `str.match`: every value that starts with a given regular-expression pattern is replaced by a single category label, so multi-select answers are classified by their first listed entry.\n"
+ ],
+ "metadata": {
+ "id": "0LS8bEhVn30b"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# (pattern, label) pairs: a value that starts with the pattern is replaced by the label.\n",
+ "# The rules are applied one after another in exactly this order.\n",
+ "race_rules = [\n",
+ "    ('Biracial', 'Biracial'),\n",
+ "    ('Black or of African descent', 'Black or African descent'),\n",
+ "    ('East Asian', 'East Asian'),\n",
+ "    ('Hispanic or Latino', 'Hispanic or Latino'),\n",
+ "    ('Indigenous', 'Indigenous'),\n",
+ "    ('Middle Eastern', 'Middle Eastern'),\n",
+ "    ('South', 'South Asian'),\n",
+ "    ('White or of European descent', 'White or European descent'),\n",
+ "    ('Multiracial', 'Multiracial'),\n",
+ "    ('Native American', 'Native American'),\n",
+ "]\n",
+ "for race_pattern, race_label in race_rules:\n",
+ "    # na=False treats missing answers as non-matching, so they are left untouched.\n",
+ "    starts_with_pattern = df['RaceEthnicity'].str.match(race_pattern, na=False)\n",
+ "    df.loc[starts_with_pattern, 'RaceEthnicity'] = race_label"
+ ],
+ "metadata": {
+ "id": "FH4ji11wnIFd"
+ },
+ "execution_count": 358,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.groupby('RaceEthnicity')['RaceEthnicity'].count()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "SJUNzLQJomx-",
+ "outputId": "d037337e-a06c-4fe3-a6ca-8f73724c4fca"
+ },
+ "execution_count": 359,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "RaceEthnicity\n",
+ "Black or African descent 5\n",
+ "East Asian 2\n",
+ "Hispanic or Latino 2\n",
+ "South Asian 8\n",
+ "White or European descent 41\n",
+ "Name: RaceEthnicity, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 359
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['RaceEthnicity'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "QIwYiTOSorvU",
+ "outputId": "3d976d23-f12f-45f9-fc7c-a42b55d8c962"
+ },
+ "execution_count": 360,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "25"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 360
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Fill missing RaceEthnicity values from respondents of the same Country: first with the\n",
+ "# next known value in that country (bfill), then with the previous one (ffill).\n",
+ "# Both fills are grouped so that values do not leak from one country into another.\n",
+ "race_filled = df.groupby('Country')['RaceEthnicity'].bfill()\n",
+ "race_filled = race_filled.groupby(df['Country']).ffill()\n",
+ "# Last resort for countries without any known value: plain forward fill over the rows.\n",
+ "df['RaceEthnicity'] = race_filled.ffill()"
+ ],
+ "metadata": {
+ "id": "ko6zkKW4o5V2"
+ },
+ "execution_count": 361,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['RaceEthnicity'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "o_q1T5xRo9H3",
+ "outputId": "5c2a6aea-54d0-4dac-c128-26b92566658f"
+ },
+ "execution_count": 362,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 362
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Hence, there are no existing null values in the race ethnicity column"
+ ],
+ "metadata": {
+ "id": "ta2DJio-poxZ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**DevType**"
+ ],
+ "metadata": {
+ "id": "8DUVlql_pz1T"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['DevType'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "vbxq2hzxpnep",
+ "outputId": "f2b953ec-7e86-4ec6-aba4-20aacafa8251"
+ },
+ "execution_count": 363,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 363
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Forward-fill: a missing DevType value takes the previous row's answer.\n",
+ "# Series.ffill() replaces the deprecated fillna(method='ffill'), and assigning the\n",
+ "# result back avoids the chained inplace call that may leave df unchanged.\n",
+ "df['DevType'] = df['DevType'].ffill()"
+ ],
+ "metadata": {
+ "id": "SYNuGwm-p-rh"
+ },
+ "execution_count": 364,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.groupby('DevType')['DevType'].count()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "6OQ3ubLwqA8k",
+ "outputId": "3ef03708-5247-46c4-c3c9-d3f188b4dfe9"
+ },
+ "execution_count": 365,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "DevType\n",
+ "Back-end developer 2\n",
+ "Back-end developer;C-suite executive (CEO, CTO, etc.);Data or business analyst;Database administrator;DevOps specialist;Engineering manager;Full-stack developer;System administrator 1\n",
+ "Back-end developer;C-suite executive (CEO, CTO, etc.);Database administrator;Designer;Front-end developer;Full-stack developer;Mobile developer 1\n",
+ "Back-end developer;C-suite executive (CEO, CTO, etc.);Database administrator;DevOps specialist;Engineering manager;Full-stack developer;System administrator 1\n",
+ "Back-end developer;Data or business analyst;Database administrator;DevOps specialist;Front-end developer;Full-stack developer;System administrator 1\n",
+ "Back-end developer;Data or business analyst;Designer;Front-end developer;Game or graphics developer;Mobile developer;Student 1\n",
+ "Back-end developer;Data scientist or machine learning specialist;Desktop or enterprise applications developer;Front-end developer;Full-stack developer 1\n",
+ "Back-end developer;Data scientist or machine learning specialist;DevOps specialist;Educator or academic researcher;QA or test developer;System administrator 1\n",
+ "Back-end developer;Data scientist or machine learning specialist;Full-stack developer;Game or graphics developer;Student 1\n",
+ "Back-end developer;Database administrator;Designer;Desktop or enterprise applications developer;Front-end developer;Full-stack developer 1\n",
+ "Back-end developer;Database administrator;Designer;Front-end developer;Full-stack developer;Mobile developer;System administrator 1\n",
+ "Back-end developer;Database administrator;Desktop or enterprise applications developer;DevOps specialist;Full-stack developer;QA or test developer 1\n",
+ "Back-end developer;Database administrator;DevOps specialist;Front-end developer;Full-stack developer;Mobile developer 1\n",
+ "Back-end developer;Database administrator;DevOps specialist;Front-end developer;Full-stack developer;QA or test developer;System administrator 1\n",
+ "Back-end developer;Database administrator;Front-end developer 1\n",
+ "Back-end developer;Database administrator;Front-end developer;Full-stack developer 1\n",
+ "Back-end developer;Database administrator;Front-end developer;Full-stack developer;Mobile developer 1\n",
+ "Back-end developer;Database administrator;Front-end developer;Student;System administrator 1\n",
+ "Back-end developer;Designer;Educator or academic researcher;Front-end developer 1\n",
+ "Back-end developer;Designer;Front-end developer;Full-stack developer;Marketing or sales professional;Mobile developer 1\n",
+ "Back-end developer;Desktop or enterprise applications developer;Embedded applications or devices developer 2\n",
+ "Back-end developer;Desktop or enterprise applications developer;Embedded applications or devices developer;Front-end developer;Full-stack developer;System administrator 1\n",
+ "Back-end developer;Desktop or enterprise applications developer;Front-end developer;Full-stack developer 2\n",
+ "Back-end developer;Desktop or enterprise applications developer;Front-end developer;Full-stack developer;Game or graphics developer;Student 1\n",
+ "Back-end developer;Desktop or enterprise applications developer;QA or test developer 1\n",
+ "Back-end developer;DevOps specialist 2\n",
+ "Back-end developer;DevOps specialist;Front-end developer;Full-stack developer;Mobile developer 1\n",
+ "Back-end developer;Embedded applications or devices developer 1\n",
+ "Back-end developer;Embedded applications or devices developer;Full-stack developer 1\n",
+ "Back-end developer;Engineering manager 1\n",
+ "Back-end developer;Front-end developer 1\n",
+ "Back-end developer;Front-end developer;Full-stack developer 5\n",
+ "Back-end developer;Front-end developer;Full-stack developer;Mobile developer 1\n",
+ "Back-end developer;Front-end developer;Student 2\n",
+ "Back-end developer;Full-stack developer 5\n",
+ "Back-end developer;Full-stack developer;QA or test developer 1\n",
+ "Back-end developer;Full-stack developer;System administrator 1\n",
+ "Data or business analyst;Data scientist or machine learning specialist;Database administrator;DevOps specialist 1\n",
+ "Data or business analyst;Database administrator;DevOps specialist;System administrator 1\n",
+ "Data or business analyst;Desktop or enterprise applications developer;Game or graphics developer;QA or test developer;Student 1\n",
+ "Data scientist or machine learning specialist 1\n",
+ "Database administrator;DevOps specialist;Full-stack developer;System administrator 1\n",
+ "Database administrator;Full-stack developer;Mobile developer 1\n",
+ "Designer;Front-end developer 2\n",
+ "Designer;Front-end developer;Marketing or sales professional 1\n",
+ "Designer;Front-end developer;QA or test developer 1\n",
+ "Desktop or enterprise applications developer;Embedded applications or devices developer;Full-stack developer;Game or graphics developer;Mobile developer 1\n",
+ "Desktop or enterprise applications developer;Front-end developer;Product manager 1\n",
+ "Embedded applications or devices developer 1\n",
+ "Embedded applications or devices developer;Engineering manager 1\n",
+ "Engineering manager;Full-stack developer 1\n",
+ "Engineering manager;Mobile developer 1\n",
+ "Front-end developer 1\n",
+ "Front-end developer;Full-stack developer 1\n",
+ "Front-end developer;Student 1\n",
+ "Full-stack developer 8\n",
+ "Full-stack developer;Product manager 1\n",
+ "Mobile developer 2\n",
+ "QA or test developer 1\n",
+ "Student 2\n",
+ "Name: DevType, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 365
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Consolidating the DevType values with `str.match`: every value that starts with a given regular-expression pattern is replaced by a single category label, so multi-select answers are classified by their first listed role."
+ ],
+ "metadata": {
+ "id": "a7Vr5H6yqqKK"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Reduce DevType to four roles: Developer, Manager, Student and Non developer.\n",
+ "# Each (pattern, role) rule relabels the values that start with the pattern; the rules\n",
+ "# are applied one after another in exactly this order.\n",
+ "dev_type_rules = [\n",
+ "    ('Back-end developer', 'Developer'),\n",
+ "    ('Student', 'Student'),\n",
+ "    ('QA or test developer', 'Non developer'),\n",
+ "    ('Product manager', 'Manager'),\n",
+ "    ('Mobile developer', 'Developer'),\n",
+ "    ('Marketing or sales professional', 'Non developer'),\n",
+ "    ('System administrator', 'Developer'),\n",
+ "    ('Game or graphics developer', 'Developer'),\n",
+ "    ('Full-stack developer', 'Developer'),\n",
+ "    ('Front-end developer', 'Developer'),\n",
+ "    ('Engineering manager', 'Manager'),\n",
+ "    ('Embedded applications or devices developer', 'Developer'),\n",
+ "    ('Educator or academic researcher', 'Student'),\n",
+ "    ('DevOps specialist', 'Developer'),\n",
+ "    ('Desktop or enterprise applications developer', 'Developer'),\n",
+ "    ('Designer', 'Non developer'),\n",
+ "    ('Database administrator', 'Developer'),\n",
+ "    ('Data scientist or machine learning specialist', 'Developer'),\n",
+ "    ('Data or business analyst', 'Developer'),\n",
+ "    ('C-suite executive', 'Developer'),\n",
+ "]\n",
+ "for dev_pattern, dev_role in dev_type_rules:\n",
+ "    # na=False treats missing answers as non-matching, so they are left untouched.\n",
+ "    starts_with_pattern = df['DevType'].str.match(dev_pattern, na=False)\n",
+ "    df.loc[starts_with_pattern, 'DevType'] = dev_role\n"
+ ],
+ "metadata": {
+ "id": "yhGUOZDrqDSJ"
+ },
+ "execution_count": 366,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.groupby('DevType')['DevType'].count()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "6Uq3rID4rLYy",
+ "outputId": "4c8efb0d-cc3d-4387-95e1-48f4f30a2b5d"
+ },
+ "execution_count": 367,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "DevType\n",
+ "Developer 74\n",
+ "Manager 2\n",
+ "Non developer 5\n",
+ "Student 2\n",
+ "Name: DevType, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 367
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Language worked with**"
+ ],
+ "metadata": {
+ "id": "0oM_tG12riOE"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# returns the 3 most frequent answers, i.e. the 3 most common language combinations (not individual languages)\n",
+ "df['LanguageWorkedWith'].value_counts().nlargest(3)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "9XA6DoJOrd1F",
+ "outputId": "99d22f2c-c3d7-48d3-ac97-571456cc99b5"
+ },
+ "execution_count": 368,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "LanguageWorkedWith\n",
+ "Java;JavaScript;PHP;SQL;TypeScript;HTML;CSS 2\n",
+ "JavaScript;PHP;HTML;CSS 2\n",
+ "C;F#;Haskell;Python;Scala 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 368
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['LanguageWorkedWith'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Bzonouh4rnpz",
+ "outputId": "53da6a72-157f-420a-e4f2-a064ca539653"
+ },
+ "execution_count": 369,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "14"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 369
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Fill missing values by forward-fill: each gap takes the previous row's answer.\n",
+ "# Series.ffill() replaces the deprecated fillna(method='ffill'), and assigning the\n",
+ "# result back avoids the chained inplace call that may leave df unchanged.\n",
+ "df['LanguageWorkedWith'] = df['LanguageWorkedWith'].ffill()"
+ ],
+ "metadata": {
+ "id": "AnfM5PJJsEow"
+ },
+ "execution_count": 370,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['LanguageWorkedWith'].value_counts().nlargest(3)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "PGdafg4HsG7-",
+ "outputId": "5682bd11-a8b2-4066-f81d-00e9b2c9222e"
+ },
+ "execution_count": 371,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "LanguageWorkedWith\n",
+ "Java;JavaScript;PHP;VB.NET;HTML;CSS 3\n",
+ "Assembly;C;C++;Java;Python;Delphi/Object Pascal 2\n",
+ "Java;JavaScript;PHP;SQL;TypeScript;HTML;CSS 2\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 371
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['LanguageWorkedWith'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "EKmOpf2TsP6n",
+ "outputId": "2542bcf8-824e-47f8-cc8e-37cad167f562"
+ },
+ "execution_count": 372,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 372
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Hence, there are no remaining null values in the LanguageWorkedWith column"
+ ],
+ "metadata": {
+ "id": "BcCtC4FesbbH"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Language desired next year**"
+ ],
+ "metadata": {
+ "id": "WqnvC05gsybz"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# returns the 3 most frequent LanguageDesireNextYear answers (language combinations)\n",
+ "df['LanguageDesireNextYear'].value_counts().nlargest(3)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "I4UcToECsTjv",
+ "outputId": "60d1d194-fead-45ec-f067-1d1f96430f67"
+ },
+ "execution_count": 373,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "LanguageDesireNextYear\n",
+ "C#;JavaScript;PHP;SQL;HTML;CSS 2\n",
+ "JavaScript;Python;HTML;CSS 1\n",
+ "C#;JavaScript;TypeScript 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 373
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['LanguageDesireNextYear'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "XUG2NWJ8s4H8",
+ "outputId": "aa0c3203-aad2-4e6e-cd0f-2216fc15a3f4"
+ },
+ "execution_count": 374,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "18"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 374
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Fill missing values by forward-fill: each gap takes the previous row's answer.\n",
+ "# Series.ffill() replaces the deprecated fillna(method='ffill'), and assigning the\n",
+ "# result back avoids the chained inplace call that may leave df unchanged.\n",
+ "df['LanguageDesireNextYear'] = df['LanguageDesireNextYear'].ffill()"
+ ],
+ "metadata": {
+ "id": "y2uYLmpxtFcn"
+ },
+ "execution_count": 375,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['LanguageDesireNextYear'].value_counts().nlargest(3)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "DLb6YvEBtMSf",
+ "outputId": "a3d87f6a-b350-452f-b2a8-ddb502f2b0a4"
+ },
+ "execution_count": 376,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "LanguageDesireNextYear\n",
+ "Java;Python 4\n",
+ "JavaScript;PHP;SQL;Swift;CSS;Bash/Shell 2\n",
+ "C#;Java;JavaScript;Ruby;TypeScript;HTML;CSS 2\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 376
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['LanguageDesireNextYear'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "2wEXX5jztUPN",
+ "outputId": "a81162b7-a9e1-406d-a9d8-2fba4d534fcf"
+ },
+ "execution_count": 377,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 377
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Hence, there are no existing null values in the language Desire next year column"
+ ],
+ "metadata": {
+ "id": "IOB02QJptbuC"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**YearsCodingProf**"
+ ],
+ "metadata": {
+ "id": "DpIvYka_tl7Y"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# returns the 10 most common YearsCodingProf brackets (ranked by number of respondents, not by experience)\n",
+ "df['YearsCodingProf'].value_counts().nlargest(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "HTVOvj1itYyG",
+ "outputId": "b2ec5606-cdfb-4305-9fe7-c036c643a6e4"
+ },
+ "execution_count": 378,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "YearsCodingProf\n",
+ "3-5 years 26\n",
+ "0-2 years 20\n",
+ "6-8 years 12\n",
+ "9-11 years 9\n",
+ "12-14 years 6\n",
+ "18-20 years 3\n",
+ "21-23 years 2\n",
+ "24-26 years 2\n",
+ "15-17 years 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 378
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['YearsCodingProf'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Hptn0b6Ut3kL",
+ "outputId": "fd7a9557-27e1-4839-b1c4-f58462269422"
+ },
+ "execution_count": 379,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "2"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 379
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Forward-fill: a missing YearsCodingProf value takes the previous row's answer.\n",
+ "# Series.ffill() replaces the deprecated fillna(method='ffill'), and assigning the\n",
+ "# result back avoids the chained inplace call that may leave df unchanged.\n",
+ "df['YearsCodingProf'] = df['YearsCodingProf'].ffill()"
+ ],
+ "metadata": {
+ "id": "xI1uiR84t-AE"
+ },
+ "execution_count": 380,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['YearsCodingProf'].value_counts().nlargest(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "OOeODKlBuI6-",
+ "outputId": "6cdd40b4-63fa-4716-8821-5e802cbceca6"
+ },
+ "execution_count": 381,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "YearsCodingProf\n",
+ "3-5 years 27\n",
+ "0-2 years 21\n",
+ "6-8 years 12\n",
+ "9-11 years 9\n",
+ "12-14 years 6\n",
+ "18-20 years 3\n",
+ "21-23 years 2\n",
+ "24-26 years 2\n",
+ "15-17 years 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 381
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['YearsCodingProf'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "iGiq0aEouASu",
+ "outputId": "1a89b314-4306-4413-a9f0-397d1878f586"
+ },
+ "execution_count": 382,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 382
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Hence, there are no existing null values in the yearscodingprof column"
+ ],
+ "metadata": {
+ "id": "cjOfpbY4uOjB"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#top 10 most common YearsCoding ranges\n",
+ "df['YearsCoding'].value_counts().nlargest(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "mWblz6_9uCZR",
+ "outputId": "92d0664e-6ec6-4cf9-a37d-333c8b8c389e"
+ },
+ "execution_count": 383,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "YearsCoding\n",
+ "6-8 years 17\n",
+ "3-5 years 15\n",
+ "9-11 years 15\n",
+ "0-2 years 11\n",
+ "15-17 years 7\n",
+ "12-14 years 6\n",
+ "24-26 years 5\n",
+ "18-20 years 4\n",
+ "30 or more years 3\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 383
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['YearsCoding'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Q5hqJopHuWkK",
+ "outputId": "9f03a6d2-f2b2-40bb-d7a6-0acd803e8754"
+ },
+ "execution_count": 384,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 384
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#filling missing values: forward-fill and assign back to the column\n",
+ "# (chained fillna(..., inplace=True) acts on a temporary in recent pandas,\n",
+ "# and fillna's method= argument is deprecated in favour of ffill())\n",
+ "df['YearsCoding'] = df['YearsCoding'].ffill()"
+ ],
+ "metadata": {
+ "id": "t_c4jz26ug5q"
+ },
+ "execution_count": 385,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['YearsCoding'].value_counts().nlargest(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "CtrovjDBuxcb",
+ "outputId": "79854d29-edde-4741-ee14-2fa9858dff36"
+ },
+ "execution_count": 386,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "YearsCoding\n",
+ "6-8 years 17\n",
+ "3-5 years 15\n",
+ "9-11 years 15\n",
+ "0-2 years 11\n",
+ "15-17 years 7\n",
+ "12-14 years 6\n",
+ "24-26 years 5\n",
+ "18-20 years 4\n",
+ "30 or more years 3\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 386
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['YearsCoding'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ZB_36JfBukFp",
+ "outputId": "f287f7b2-e239-47cc-8eae-d43dd973e00a"
+ },
+ "execution_count": 387,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 387
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Hence, there are no existing null values in the yearscoding column"
+ ],
+ "metadata": {
+ "id": "A7C6tWmJu2sG"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Operating System**"
+ ],
+ "metadata": {
+ "id": "TnxIzjxCu6QO"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#top 10 most used operating systems\n",
+ "df['OperatingSystem'].value_counts().nlargest(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "yj_CIgyOuqL5",
+ "outputId": "31beeb33-d817-402d-b2f9-37ae3bf82fc8"
+ },
+ "execution_count": 388,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "OperatingSystem\n",
+ "Windows 32\n",
+ "MacOS 20\n",
+ "Linux-based 15\n",
+ "BSD/Unix 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 388
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['OperatingSystem'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "GfIBSbjNu-6f",
+ "outputId": "8aba7700-6325-4abd-ab74-ab019752df18"
+ },
+ "execution_count": 389,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "15"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 389
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#filling values: forward-fill and assign back to the column\n",
+ "# (chained fillna(..., inplace=True) acts on a temporary in recent pandas,\n",
+ "# and fillna's method= argument is deprecated in favour of ffill())\n",
+ "df['OperatingSystem'] = df['OperatingSystem'].ffill()"
+ ],
+ "metadata": {
+ "id": "HEz3Vxr9vPcu"
+ },
+ "execution_count": 390,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['OperatingSystem'].value_counts().nlargest(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "oeSMf-SIvTwZ",
+ "outputId": "b317c4bd-dd9e-4864-fecd-fb891410158b"
+ },
+ "execution_count": 391,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "OperatingSystem\n",
+ "Windows 44\n",
+ "MacOS 21\n",
+ "Linux-based 17\n",
+ "BSD/Unix 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 391
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['OperatingSystem'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "FofzEDQkvWll",
+ "outputId": "cff0fdf4-6da6-4e53-d5e8-c10f88e4af62"
+ },
+ "execution_count": 392,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 392
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Hence, there are no existing null values in the operating system column"
+ ],
+ "metadata": {
+ "id": "cCjVGXBtwNrC"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Salary Type**"
+ ],
+ "metadata": {
+ "id": "9BviI_lAwZpR"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#top 10 frequently used salary types\n",
+ "df['SalaryType'].value_counts().nlargest(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "EjK1EthMvZmc",
+ "outputId": "7340b9a7-8471-4cab-9175-5b0f61725a9a"
+ },
+ "execution_count": 393,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "SalaryType\n",
+ "Monthly 25\n",
+ "Yearly 22\n",
+ "Weekly 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 393
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['SalaryType'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "P5qYCnc2wn4T",
+ "outputId": "2cf3e054-7fa2-4d65-ace6-15b46f2459b6"
+ },
+ "execution_count": 394,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "35"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 394
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Forward-fill missing values and assign the result back to the column.\n",
+ "# (A chained fillna(..., inplace=True) on df['col'] acts on a temporary in recent\n",
+ "# pandas, and fillna's method= argument is deprecated in favour of ffill().)\n",
+ "df['SalaryType'] = df['SalaryType'].ffill()"
+ ],
+ "metadata": {
+ "id": "-x9tHjLQwq5t"
+ },
+ "execution_count": 395,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['SalaryType'].value_counts().nlargest(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "YnB1-anYws9D",
+ "outputId": "3564a274-28d3-44ae-bf06-d30f07d4e532"
+ },
+ "execution_count": 396,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "SalaryType\n",
+ "Monthly 42\n",
+ "Yearly 39\n",
+ "Weekly 2\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 396
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['SalaryType'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "r_EK-CI5wvUc",
+ "outputId": "dcc27086-90ae-4049-bf85-04a84a1870fc"
+ },
+ "execution_count": 397,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 397
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Hence, there are no existing null values in the salary type column"
+ ],
+ "metadata": {
+ "id": "cSybCP-aw5LC"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Currency**"
+ ],
+ "metadata": {
+ "id": "WQmFdwV3w-_L"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#top 10 most used currencies\n",
+ "df['Currency'].value_counts().nlargest(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "g21xj2qGwycm",
+ "outputId": "200d8225-0dd4-4f08-9afc-8673ce4995f4"
+ },
+ "execution_count": 398,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Currency\n",
+ "U.S. dollars ($) 20\n",
+ "Euros (€) 11\n",
+ "British pounds sterling (£) 7\n",
+ "Indian rupees (‚Çπ) 7\n",
+ "Swedish kroner (SEK) 3\n",
+ "Russian rubles (‚ÇΩ) 3\n",
+ "Polish złoty (zł) 2\n",
+ "Chinese yuan renminbi (¥) 2\n",
+ "South African rands (R) 1\n",
+ "Australian dollars (A$) 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 398
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Currency'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "CFcuvPTWxIO_",
+ "outputId": "29f8c44e-0dfa-4165-d934-0bded39453ee"
+ },
+ "execution_count": 399,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "23"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 399
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Forward-fill missing values and assign the result back to the column.\n",
+ "# (A chained fillna(..., inplace=True) on df['col'] acts on a temporary in recent\n",
+ "# pandas, and fillna's method= argument is deprecated in favour of ffill().)\n",
+ "# Rows before the first known currency have nothing to copy from and stay null.\n",
+ "df['Currency'] = df['Currency'].ffill()"
+ ],
+ "metadata": {
+ "id": "tRct3FX2xLlp"
+ },
+ "execution_count": 400,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.dropna(subset=['Currency'], inplace = True)"
+ ],
+ "metadata": {
+ "id": "siI0zCLQxN2E"
+ },
+ "execution_count": 401,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Currency'].value_counts().nlargest(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "slvL5FMsxWCx",
+ "outputId": "f7b9d9bb-9f36-4d11-a8f4-d63b4ecb7fe7"
+ },
+ "execution_count": 402,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Currency\n",
+ "U.S. dollars ($) 27\n",
+ "Euros (€) 16\n",
+ "British pounds sterling (£) 10\n",
+ "Indian rupees (‚Çπ) 8\n",
+ "Chinese yuan renminbi (¥) 5\n",
+ "Swedish kroner (SEK) 4\n",
+ "Russian rubles (‚ÇΩ) 4\n",
+ "Polish złoty (zł) 2\n",
+ "Brazilian reais (R$) 2\n",
+ "South African rands (R) 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 402
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Currency'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "5A2t9JscxYgE",
+ "outputId": "9f0a16f8-b361-4fba-c59f-3105e38ff5df"
+ },
+ "execution_count": 403,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 403
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Hence, there are no existing null values in the currency column"
+ ],
+ "metadata": {
+ "id": "1SwoIy1nxclx"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Renaming the column for our convenience"
+ ],
+ "metadata": {
+ "id": "5ZN2w7IQ6oDj"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "col=['Age','ConvertedSalary','Country','Currency','DevType','Employment','RaceEthnicity','Gender','SalaryType','Hobby','JobSatisfaction','JobSearchStatus','OperatingSystem','UndergradMajor','YearsCoding','YearsCodingProf','LanguageDesireNextYear','LanguageWorkedWith','FormalEducation']\n",
+ "df1=df[col]\n",
+ "#renaming the column\n",
+ "# 'ConvertedSalary': 'SalaryUSD'\n",
+ "df.rename(columns={'ConvertedSalary': 'SalaryUSD' }, inplace =True)\n",
+ "df.sort_index(axis=1).head(2)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 248
+ },
+ "id": "RYdkvEksxbOA",
+ "outputId": "eff4eb85-085e-455f-c511-45b43b0a6806"
+ },
+ "execution_count": 404,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " AIDangerous \\\n",
+ "1 Increasing automation of jobs \n",
+ "2 NaN \n",
+ "\n",
+ " AIFuture \\\n",
+ "1 I'm excited about the possibilities more than ... \n",
+ "2 NaN \n",
+ "\n",
+ " AIInteresting \\\n",
+ "1 Increasing automation of jobs \n",
+ "2 NaN \n",
+ "\n",
+ " AIResponsible AdBlocker AdBlockerDisable \\\n",
+ "1 The developers or the people creating the AI Yes Yes \n",
+ "2 NaN NaN NaN \n",
+ "\n",
+ " AdBlockerReasons AdsActions \\\n",
+ "1 The website I was visiting asked me to disable it NaN \n",
+ "2 NaN NaN \n",
+ "\n",
+ " AdsAgreeDisagree1 AdsAgreeDisagree2 ... SurveyEasy \\\n",
+ "1 Somewhat agree Neither agree nor disagree ... Somewhat easy \n",
+ "2 NaN NaN ... NaN \n",
+ "\n",
+ " SurveyTooLong TimeAfterBootcamp \\\n",
+ "1 The survey was an appropriate length NaN \n",
+ "2 NaN NaN \n",
+ "\n",
+ " TimeFullyProductive UndergradMajor UpdateCV \\\n",
+ "1 One to three months Other Science I saw an employer’s advertisement \n",
+ "2 NaN Computer Science NaN \n",
+ "\n",
+ " VersionControl WakeTime YearsCoding YearsCodingProf \n",
+ "1 Git;Subversion Between 6:01 - 7:00 AM 30 or more years 18-20 years \n",
+ "2 NaN NaN 24-26 years 6-8 years \n",
+ "\n",
+ "[2 rows x 129 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " AIDangerous | \n",
+ " AIFuture | \n",
+ " AIInteresting | \n",
+ " AIResponsible | \n",
+ " AdBlocker | \n",
+ " AdBlockerDisable | \n",
+ " AdBlockerReasons | \n",
+ " AdsActions | \n",
+ " AdsAgreeDisagree1 | \n",
+ " AdsAgreeDisagree2 | \n",
+ " ... | \n",
+ " SurveyEasy | \n",
+ " SurveyTooLong | \n",
+ " TimeAfterBootcamp | \n",
+ " TimeFullyProductive | \n",
+ " UndergradMajor | \n",
+ " UpdateCV | \n",
+ " VersionControl | \n",
+ " WakeTime | \n",
+ " YearsCoding | \n",
+ " YearsCodingProf | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " Increasing automation of jobs | \n",
+ " I'm excited about the possibilities more than ... | \n",
+ " Increasing automation of jobs | \n",
+ " The developers or the people creating the AI | \n",
+ " Yes | \n",
+ " Yes | \n",
+ " The website I was visiting asked me to disable it | \n",
+ " NaN | \n",
+ " Somewhat agree | \n",
+ " Neither agree nor disagree | \n",
+ " ... | \n",
+ " Somewhat easy | \n",
+ " The survey was an appropriate length | \n",
+ " NaN | \n",
+ " One to three months | \n",
+ " Other Science | \n",
+ " I saw an employer’s advertisement | \n",
+ " Git;Subversion | \n",
+ " Between 6:01 - 7:00 AM | \n",
+ " 30 or more years | \n",
+ " 18-20 years | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Computer Science | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 24-26 years | \n",
+ " 6-8 years | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 129 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe"
+ }
+ },
+ "metadata": {},
+ "execution_count": 404
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#top 10 salaries\n",
+ "df['SalaryUSD'].value_counts().nlargest(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "iOeLpbNu5xYD",
+ "outputId": "dd2640ff-11dd-48e2-cc5c-171ac50e6cf5"
+ },
+ "execution_count": 405,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "SalaryUSD\n",
+ "120000.0 3\n",
+ "30000.0 2\n",
+ "115000.0 2\n",
+ "70841.0 1\n",
+ "36000.0 1\n",
+ "90000.0 1\n",
+ "73428.0 1\n",
+ "128507.0 1\n",
+ "13212.0 1\n",
+ "48955.0 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 405
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['SalaryUSD'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ViidUYaW62ha",
+ "outputId": "a3e892eb-025e-4ae9-93ea-f59615f1d50b"
+ },
+ "execution_count": 406,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "36"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 406
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#grouping the salary based on the devtype and their country\n",
+ "mean_salary = df.groupby(['DevType','Country'])['SalaryUSD'].mean()"
+ ],
+ "metadata": {
+ "id": "1PuVMW007JQx"
+ },
+ "execution_count": 407,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "mean_salary.nlargest(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "B14qBmwx7Mm2",
+ "outputId": "a5591844-7ea4-4d55-97da-162ed8f47273"
+ },
+ "execution_count": 408,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "DevType Country \n",
+ "Developer United States 146687.5\n",
+ " Ireland 128507.0\n",
+ "Non developer India 123984.0\n",
+ "Developer Australia 95968.0\n",
+ " Colombia 64116.0\n",
+ " Germany 61191.5\n",
+ " Sweden 60257.5\n",
+ " China 52604.0\n",
+ " Greece 51408.0\n",
+ " United Kingdom 48144.8\n",
+ "Name: SalaryUSD, dtype: float64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 408
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#mean salary of each (YearsCodingProf, DevType, Country) group, repeated on every row of that group\n",
+ "means = df.groupby(['YearsCodingProf','DevType', 'Country'])['SalaryUSD'].transform('mean')"
+ ],
+ "metadata": {
+ "id": "vyFU-a0q7hYG"
+ },
+ "execution_count": 409,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#filling the null values with the mean values\n",
+ "df['SalaryUSD'] = df['SalaryUSD'].fillna(means)"
+ ],
+ "metadata": {
+ "id": "JZiz2R937nDK"
+ },
+ "execution_count": 410,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#grouping the mean salary based on the YearsCodingProf, DevType and Country columns\n",
+ "mean_salary = df.groupby(['YearsCodingProf','DevType','Country'])['SalaryUSD'].mean()"
+ ],
+ "metadata": {
+ "id": "Pi_YcfwD8FK5"
+ },
+ "execution_count": 411,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "mean_salary.nlargest(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "aNWX4Zok8MRy",
+ "outputId": "a731efc7-6212-4834-97fc-d1cadd91b98d"
+ },
+ "execution_count": 412,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "YearsCodingProf DevType Country \n",
+ "21-23 years Developer United States 250000.000000\n",
+ "0-2 years Developer United States 244000.000000\n",
+ "15-17 years Developer Ireland 128507.000000\n",
+ "0-2 years Non developer India 123984.000000\n",
+ "9-11 years Developer United States 115000.000000\n",
+ "12-14 years Developer Australia 95968.000000\n",
+ "18-20 years Developer United States 95000.000000\n",
+ "6-8 years Developer United States 91333.333333\n",
+ "3-5 years Developer China 85708.000000\n",
+ "9-11 years Developer United Kingdom 82648.000000\n",
+ "Name: SalaryUSD, dtype: float64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 412
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.dropna(subset=['SalaryUSD'], inplace = True)"
+ ],
+ "metadata": {
+ "id": "RV4purXk8YwP"
+ },
+ "execution_count": 413,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**AGE**"
+ ],
+ "metadata": {
+ "id": "1zhbNy2S8cf3"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#top 10 most frequent age groups\n",
+ "df['Age'].value_counts().nlargest(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "NO728mSY8bEC",
+ "outputId": "95f51e92-f33d-47c8-a35a-2293d95d1481"
+ },
+ "execution_count": 414,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Age\n",
+ "25 - 34 years old 22\n",
+ "35 - 44 years old 13\n",
+ "18 - 24 years old 13\n",
+ "45 - 54 years old 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 414
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Age'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "SGAuNMgn8muK",
+ "outputId": "c9464ee6-3eba-4a8a-d7f6-60c4308f5aa7"
+ },
+ "execution_count": 415,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "10"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 415
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Forward-fill missing values and assign the result back to the column.\n",
+ "# (A chained fillna(..., inplace=True) on df['col'] acts on a temporary in recent\n",
+ "# pandas, and fillna's method= argument is deprecated in favour of ffill().)\n",
+ "df['Age'] = df['Age'].ffill()"
+ ],
+ "metadata": {
+ "id": "olP6Db5J8pbD"
+ },
+ "execution_count": 416,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Age'].value_counts().nlargest(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Bk5V6oIM8u3z",
+ "outputId": "2add576a-b414-4826-a997-15f1a077178a"
+ },
+ "execution_count": 417,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Age\n",
+ "25 - 34 years old 26\n",
+ "35 - 44 years old 16\n",
+ "18 - 24 years old 16\n",
+ "45 - 54 years old 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 417
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Age'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "yRa2gNv68riy",
+ "outputId": "119b440a-d807-4bb1-e825-e22e1c4c381f"
+ },
+ "execution_count": 418,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 418
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Hence, there are no existing null values in age column"
+ ],
+ "metadata": {
+ "id": "qsC_39Qo83__"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Formal Education**\n"
+ ],
+ "metadata": {
+ "id": "QZEkje_V9AGJ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['FormalEducation'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "NK5Ynajx8vzu",
+ "outputId": "008556a6-7826-4f32-c7f0-6315a3196cf3"
+ },
+ "execution_count": 419,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 419
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['FormalEducation'].value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ngTqWgm09HYP",
+ "outputId": "26c46333-913f-43b1-b41e-a86d4a69f275"
+ },
+ "execution_count": 420,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "FormalEducation\n",
+ "Bachelor’s degree (BA, BS, B.Eng., etc.) 25\n",
+ "Master’s degree (MA, MS, M.Eng., MBA, etc.) 17\n",
+ "Some college/university study without earning a degree 12\n",
+ "Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.) 3\n",
+ "Associate degree 2\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 420
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#Changing the column's name for our convenience\n",
+ "df.rename(columns={'FormalEducation':'EdLevel'}, inplace =True)"
+ ],
+ "metadata": {
+ "id": "3eoaxYJb9Jo0"
+ },
+ "execution_count": 421,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Grouping the education levels into a few broader categories"
+ ],
+ "metadata": {
+ "id": "NgV5cbDd9Xx9"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def refactor_ed(df):\n",
+ "    '''Collapse the verbose survey answers in the EdLevel column into short categories.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    df : pandas.DataFrame\n",
+ "        Frame with an 'EdLevel' column holding the raw survey answers.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    pandas.DataFrame\n",
+ "        The same frame, with 'EdLevel' overwritten by one of Associate, Bachelors,\n",
+ "        Masters, Professional, Doctorate or No Degree. Any answer not listed below\n",
+ "        (missing values included) becomes the placeholder string nan.\n",
+ "    '''\n",
+ "    # Every answer that does not correspond to a completed degree.\n",
+ "    no_degree_answers = ['Some college/university study without earning a degree',\n",
+ "                         'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)',\n",
+ "                         'Primary/elementary school',\n",
+ "                         'I never completed any formal education']\n",
+ "\n",
+ "    conditions_ed = [df['EdLevel'] == 'Associate degree',\n",
+ "                     df['EdLevel'] == 'Bachelor’s degree (BA, BS, B.Eng., etc.)',\n",
+ "                     df['EdLevel'] == 'Master’s degree (MA, MS, M.Eng., MBA, etc.)',\n",
+ "                     df['EdLevel'] == 'Professional degree (JD, MD, etc.)',\n",
+ "                     df['EdLevel'] == 'Other doctoral degree (Ph.D, Ed.D., etc.)',\n",
+ "                     df['EdLevel'].isin(no_degree_answers)]\n",
+ "    choices_ed = ['Associate', 'Bachelors', 'Masters', 'Professional', 'Doctorate', 'No Degree']\n",
+ "\n",
+ "    # np.NaN no longer exists in NumPy 2.0, and a float default cannot share a dtype\n",
+ "    # with string choices there. Older NumPy silently stored it as the text 'nan',\n",
+ "    # so that placeholder is spelled out to keep the result unchanged.\n",
+ "    df['EdLevel'] = np.select(conditions_ed, choices_ed, default='nan')\n",
+ "    return df"
+ ],
+ "metadata": {
+ "id": "lhdFv5Ym9QIX"
+ },
+ "execution_count": 422,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df = refactor_ed(df)"
+ ],
+ "metadata": {
+ "id": "NewXTlW79TrK"
+ },
+ "execution_count": 423,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#Assigning the respondents whose education level was not recognised to Bachelors.\n",
+ "#The result is assigned back because a chained replace(..., inplace=True) on\n",
+ "#df['EdLevel'] acts on a temporary in recent pandas.\n",
+ "df['EdLevel'] = df['EdLevel'].replace('nan', 'Bachelors')"
+ ],
+ "metadata": {
+ "id": "i-KubSAV9odW"
+ },
+ "execution_count": 424,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['EdLevel'].value_counts()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "JTnp5YHk9rjH",
+ "outputId": "fa7b8ca3-c5be-4561-b5fd-1a96c6709531"
+ },
+ "execution_count": 425,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "EdLevel\n",
+ "Bachelors 42\n",
+ "No Degree 15\n",
+ "Associate 2\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 425
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['EdLevel'].isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "pifoZPD19z-2",
+ "outputId": "64c23e01-d963-4424-fad6-87aa334e1e74"
+ },
+ "execution_count": 426,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 426
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Hence, there are no existing null values in the edlevel column"
+ ],
+ "metadata": {
+ "id": "0AywkL0v9y-I"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "*So far we have cleaned a lot of the data, so we can check the percentages again*"
+ ],
+ "metadata": {
+ "id": "rSs6cDdi-LBn"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Snapshot of the cleaned frame. The previous df[df.notnull()] only masked cells\n",
+ "# (nulls stayed NaN) and kept every row and column, so it was a copy as well;\n",
+ "# nulls in columns that were not cleaned above are still present.\n",
+ "cleaned_2018 = df.copy()"
+ ],
+ "metadata": {
+ "id": "uyXEowD39uAh"
+ },
+ "execution_count": 427,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "cleaned_2018.head(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 933
+ },
+ "id": "uK6hUeN_9xsD",
+ "outputId": "326fc3aa-14fb-4a5e-de46-b0ad8076036c"
+ },
+ "execution_count": 428,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Respondent Hobby OpenSource Country Student \\\n",
+ "1 3 Yes Yes United Kingdom No \n",
+ "4 7 Yes No South Africa Yes, part-time \n",
+ "5 8 Yes No United Kingdom No \n",
+ "6 9 Yes Yes United States No \n",
+ "8 11 Yes Yes United States No \n",
+ "13 20 No No India No \n",
+ "14 21 No No Netherlands Yes, full-time \n",
+ "17 27 Yes No Sweden No \n",
+ "18 29 Yes Yes India Yes, full-time \n",
+ "20 33 Yes Yes Australia No \n",
+ "\n",
+ " Employment EdLevel UndergradMajor CompanySize \\\n",
+ "1 Employed full-time Bachelors Other Science 10,000 or more employees \n",
+ "4 Employed full-time No Degree Computer Science 10,000 or more employees \n",
+ "5 Employed full-time Bachelors Computer Science 10 to 19 employees \n",
+ "6 Employed full-time No Degree Computer Science 10,000 or more employees \n",
+ "8 Employed full-time No Degree Arts and Science 100 to 499 employees \n",
+ "13 Employed full-time Bachelors Engineering 20 to 99 employees \n",
+ "14 Employed full-time No Degree No major 20 to 99 employees \n",
+ "17 Employed full-time Bachelors Business 10 to 19 employees \n",
+ "18 Employed full-time Bachelors No major 10,000 or more employees \n",
+ "20 Employed full-time Bachelors Engineering 1,000 to 4,999 employees \n",
+ "\n",
+ " DevType ... Exercise Gender \\\n",
+ "1 Developer ... Daily or almost every day Male \n",
+ "4 Developer ... 3 - 4 times per week Male \n",
+ "5 Developer ... 1 - 2 times per week Male \n",
+ "6 Developer ... I don't typically exercise Male \n",
+ "8 Developer ... I don't typically exercise Male \n",
+ "13 Developer ... I don't typically exercise Non-conforming \n",
+ "14 Developer ... Daily or almost every day Male \n",
+ "17 Developer ... 3 - 4 times per week Male \n",
+ "18 Developer ... Daily or almost every day Female \n",
+ "20 Developer ... 3 - 4 times per week Male \n",
+ "\n",
+ " SexualOrientation \\\n",
+ "1 Straight or heterosexual \n",
+ "4 Straight or heterosexual \n",
+ "5 Straight or heterosexual \n",
+ "6 Straight or heterosexual \n",
+ "8 Straight or heterosexual \n",
+ "13 NaN \n",
+ "14 NaN \n",
+ "17 NaN \n",
+ "18 NaN \n",
+ "20 Straight or heterosexual \n",
+ "\n",
+ " EducationParents \\\n",
+ "1 Bachelor’s degree (BA, BS, B.Eng., etc.) \n",
+ "4 Some college/university study without earning ... \n",
+ "5 Secondary school (e.g. American high school, G... \n",
+ "6 Master’s degree (MA, MS, M.Eng., MBA, etc.) \n",
+ "8 Some college/university study without earning ... \n",
+ "13 NaN \n",
+ "14 Associate degree \n",
+ "17 Bachelor’s degree (BA, BS, B.Eng., etc.) \n",
+ "18 Some college/university study without earning ... \n",
+ "20 Bachelor’s degree (BA, BS, B.Eng., etc.) \n",
+ "\n",
+ " RaceEthnicity Age Dependents MilitaryUS \\\n",
+ "1 White or European descent 35 - 44 years old Yes NaN \n",
+ "4 White or European descent 18 - 24 years old Yes NaN \n",
+ "5 White or European descent 18 - 24 years old No NaN \n",
+ "6 White or European descent 18 - 24 years old No No \n",
+ "8 White or European descent 35 - 44 years old Yes No \n",
+ "13 South Asian 35 - 44 years old NaN NaN \n",
+ "14 White or European descent 18 - 24 years old No NaN \n",
+ "17 White or European descent 35 - 44 years old Yes NaN \n",
+ "18 South Asian 35 - 44 years old NaN NaN \n",
+ "20 South Asian 35 - 44 years old Yes NaN \n",
+ "\n",
+ " SurveyTooLong SurveyEasy \n",
+ "1 The survey was an appropriate length Somewhat easy \n",
+ "4 The survey was an appropriate length Somewhat easy \n",
+ "5 The survey was an appropriate length Somewhat easy \n",
+ "6 The survey was an appropriate length Somewhat easy \n",
+ "8 The survey was an appropriate length Very easy \n",
+ "13 NaN NaN \n",
+ "14 The survey was an appropriate length Neither easy nor difficult \n",
+ "17 The survey was too long Somewhat difficult \n",
+ "18 The survey was too long Very difficult \n",
+ "20 The survey was too long Neither easy nor difficult \n",
+ "\n",
+ "[10 rows x 129 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Respondent | \n",
+ " Hobby | \n",
+ " OpenSource | \n",
+ " Country | \n",
+ " Student | \n",
+ " Employment | \n",
+ " EdLevel | \n",
+ " UndergradMajor | \n",
+ " CompanySize | \n",
+ " DevType | \n",
+ " ... | \n",
+ " Exercise | \n",
+ " Gender | \n",
+ " SexualOrientation | \n",
+ " EducationParents | \n",
+ " RaceEthnicity | \n",
+ " Age | \n",
+ " Dependents | \n",
+ " MilitaryUS | \n",
+ " SurveyTooLong | \n",
+ " SurveyEasy | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 3 | \n",
+ " Yes | \n",
+ " Yes | \n",
+ " United Kingdom | \n",
+ " No | \n",
+ " Employed full-time | \n",
+ " Bachelors | \n",
+ " Other Science | \n",
+ " 10,000 or more employees | \n",
+ " Developer | \n",
+ " ... | \n",
+ " Daily or almost every day | \n",
+ " Male | \n",
+ " Straight or heterosexual | \n",
+ " Bachelor’s degree (BA, BS, B.Eng., etc.) | \n",
+ " White or European descent | \n",
+ " 35 - 44 years old | \n",
+ " Yes | \n",
+ " NaN | \n",
+ " The survey was an appropriate length | \n",
+ " Somewhat easy | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 7 | \n",
+ " Yes | \n",
+ " No | \n",
+ " South Africa | \n",
+ " Yes, part-time | \n",
+ " Employed full-time | \n",
+ " No Degree | \n",
+ " Computer Science | \n",
+ " 10,000 or more employees | \n",
+ " Developer | \n",
+ " ... | \n",
+ " 3 - 4 times per week | \n",
+ " Male | \n",
+ " Straight or heterosexual | \n",
+ " Some college/university study without earning ... | \n",
+ " White or European descent | \n",
+ " 18 - 24 years old | \n",
+ " Yes | \n",
+ " NaN | \n",
+ " The survey was an appropriate length | \n",
+ " Somewhat easy | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 8 | \n",
+ " Yes | \n",
+ " No | \n",
+ " United Kingdom | \n",
+ " No | \n",
+ " Employed full-time | \n",
+ " Bachelors | \n",
+ " Computer Science | \n",
+ " 10 to 19 employees | \n",
+ " Developer | \n",
+ " ... | \n",
+ " 1 - 2 times per week | \n",
+ " Male | \n",
+ " Straight or heterosexual | \n",
+ " Secondary school (e.g. American high school, G... | \n",
+ " White or European descent | \n",
+ " 18 - 24 years old | \n",
+ " No | \n",
+ " NaN | \n",
+ " The survey was an appropriate length | \n",
+ " Somewhat easy | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 9 | \n",
+ " Yes | \n",
+ " Yes | \n",
+ " United States | \n",
+ " No | \n",
+ " Employed full-time | \n",
+ " No Degree | \n",
+ " Computer Science | \n",
+ " 10,000 or more employees | \n",
+ " Developer | \n",
+ " ... | \n",
+ " I don't typically exercise | \n",
+ " Male | \n",
+ " Straight or heterosexual | \n",
+ " Master’s degree (MA, MS, M.Eng., MBA, etc.) | \n",
+ " White or European descent | \n",
+ " 18 - 24 years old | \n",
+ " No | \n",
+ " No | \n",
+ " The survey was an appropriate length | \n",
+ " Somewhat easy | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 11 | \n",
+ " Yes | \n",
+ " Yes | \n",
+ " United States | \n",
+ " No | \n",
+ " Employed full-time | \n",
+ " No Degree | \n",
+ " Arts and Science | \n",
+ " 100 to 499 employees | \n",
+ " Developer | \n",
+ " ... | \n",
+ " I don't typically exercise | \n",
+ " Male | \n",
+ " Straight or heterosexual | \n",
+ " Some college/university study without earning ... | \n",
+ " White or European descent | \n",
+ " 35 - 44 years old | \n",
+ " Yes | \n",
+ " No | \n",
+ " The survey was an appropriate length | \n",
+ " Very easy | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 20 | \n",
+ " No | \n",
+ " No | \n",
+ " India | \n",
+ " No | \n",
+ " Employed full-time | \n",
+ " Bachelors | \n",
+ " Engineering | \n",
+ " 20 to 99 employees | \n",
+ " Developer | \n",
+ " ... | \n",
+ " I don't typically exercise | \n",
+ " Non-conforming | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " South Asian | \n",
+ " 35 - 44 years old | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 21 | \n",
+ " No | \n",
+ " No | \n",
+ " Netherlands | \n",
+ " Yes, full-time | \n",
+ " Employed full-time | \n",
+ " No Degree | \n",
+ " No major | \n",
+ " 20 to 99 employees | \n",
+ " Developer | \n",
+ " ... | \n",
+ " Daily or almost every day | \n",
+ " Male | \n",
+ " NaN | \n",
+ " Associate degree | \n",
+ " White or European descent | \n",
+ " 18 - 24 years old | \n",
+ " No | \n",
+ " NaN | \n",
+ " The survey was an appropriate length | \n",
+ " Neither easy nor difficult | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 27 | \n",
+ " Yes | \n",
+ " No | \n",
+ " Sweden | \n",
+ " No | \n",
+ " Employed full-time | \n",
+ " Bachelors | \n",
+ " Business | \n",
+ " 10 to 19 employees | \n",
+ " Developer | \n",
+ " ... | \n",
+ " 3 - 4 times per week | \n",
+ " Male | \n",
+ " NaN | \n",
+ " Bachelor’s degree (BA, BS, B.Eng., etc.) | \n",
+ " White or European descent | \n",
+ " 35 - 44 years old | \n",
+ " Yes | \n",
+ " NaN | \n",
+ " The survey was too long | \n",
+ " Somewhat difficult | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 29 | \n",
+ " Yes | \n",
+ " Yes | \n",
+ " India | \n",
+ " Yes, full-time | \n",
+ " Employed full-time | \n",
+ " Bachelors | \n",
+ " No major | \n",
+ " 10,000 or more employees | \n",
+ " Developer | \n",
+ " ... | \n",
+ " Daily or almost every day | \n",
+ " Female | \n",
+ " NaN | \n",
+ " Some college/university study without earning ... | \n",
+ " South Asian | \n",
+ " 35 - 44 years old | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " The survey was too long | \n",
+ " Very difficult | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 33 | \n",
+ " Yes | \n",
+ " Yes | \n",
+ " Australia | \n",
+ " No | \n",
+ " Employed full-time | \n",
+ " Bachelors | \n",
+ " Engineering | \n",
+ " 1,000 to 4,999 employees | \n",
+ " Developer | \n",
+ " ... | \n",
+ " 3 - 4 times per week | \n",
+ " Male | \n",
+ " Straight or heterosexual | \n",
+ " Bachelor’s degree (BA, BS, B.Eng., etc.) | \n",
+ " South Asian | \n",
+ " 35 - 44 years old | \n",
+ " Yes | \n",
+ " NaN | \n",
+ " The survey was too long | \n",
+ " Neither easy nor difficult | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
10 rows × 129 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "cleaned_2018"
+ }
+ },
+ "metadata": {},
+ "execution_count": 428
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Calculating the overall percentage of missing values in df\n",
+ "missing_count = df.isnull().sum()  # number of missing values per column\n",
+ "# np.product was deprecated in NumPy 1.25 and removed in NumPy 2.0; np.prod is the supported spelling\n",
+ "total_cells = np.prod(df.shape)  # number of cells (rows x cols)\n",
+ "total_missing = missing_count.sum()  # missing values summed over all columns\n",
+ "missing_percent = (total_missing * 100) / total_cells"
+ ],
+ "metadata": {
+ "id": "2LyBn6_i-b3o"
+ },
+ "execution_count": 429,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Report the cell count, the missing-value count and the missing percentage\n",
+ "print('Total : ', total_cells)\n",
+ "print('Total missing : ', total_missing)\n",
+ "print('Missing Percentage: ', missing_percent, '%')"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "PcVgSusH-5Sh",
+ "outputId": "85c10ea5-a55c-4be0-9786-9954791dd16a"
+ },
+ "execution_count": 430,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Total : 7611\n",
+ "Total missing : 1487\n",
+ "Missing Percentage: 19.537511496518196 %\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Hence, we reduced the missing data from 35% to 19%, which is far better. The remaining null values could be present in the columns that we are not going to use in our analysis."
+ ],
+ "metadata": {
+ "id": "pwoj3j56-9nh"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Outlier detection**"
+ ],
+ "metadata": {
+ "id": "dJH6dMHm_rcr"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Detecting outliers using the interquartile range (IQR) method\n",
+ "def detect_outliers_iqr(dataframe, fields, factor=1.5):\n",
+ "    \"\"\"Return index labels of rows that are IQR outliers in at least one column.\n",
+ "\n",
+ "    A value counts as an outlier when it lies below ``Q1 - factor * IQR`` or\n",
+ "    above ``Q3 + factor * IQR``, where Q1 and Q3 are the column's 25th and\n",
+ "    75th percentiles and ``IQR = Q3 - Q1``.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    dataframe : pandas.DataFrame\n",
+ "        Data to scan.\n",
+ "    fields : iterable of str\n",
+ "        Names of the columns to check (each must support ``quantile``).\n",
+ "    factor : float, default 1.5\n",
+ "        Width of the outlier fences as a multiple of the IQR.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    list\n",
+ "        Unique index labels of the outlier rows, in order of first detection.\n",
+ "    \"\"\"\n",
+ "    outliers_indices = []\n",
+ "    for feature in fields:\n",
+ "        column = dataframe[feature]\n",
+ "        # First and third quartiles and the interquartile range between them\n",
+ "        q1 = column.quantile(0.25)\n",
+ "        q3 = column.quantile(0.75)\n",
+ "        iqr = q3 - q1\n",
+ "        # Lower and upper fences for outlier detection\n",
+ "        lower_bound = q1 - factor * iqr\n",
+ "        upper_bound = q3 + factor * iqr\n",
+ "        # Rows whose value falls outside the fences\n",
+ "        is_outlier = (column < lower_bound) | (column > upper_bound)\n",
+ "        outliers_indices.extend(column[is_outlier].index)\n",
+ "    # dict.fromkeys drops duplicates while keeping first-seen order, so the\n",
+ "    # result is deterministic (a plain set gives an arbitrary order)\n",
+ "    return list(dict.fromkeys(outliers_indices))"
+ ],
+ "metadata": {
+ "id": "pheMWmjZ-7gh"
+ },
+ "execution_count": 431,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Calculating the outliers for different columns"
+ ],
+ "metadata": {
+ "id": "sWFblaMjCekz"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Columns to scan for outliers (currently only SalaryUSD)\n",
+ "cols = ['SalaryUSD']\n"
+ ],
+ "metadata": {
+ "id": "ZxPFi0uHCaaz"
+ },
+ "execution_count": 432,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Collect the index labels of rows flagged as outliers in the selected columns\n",
+ "outliers_indices_iqr = detect_outliers_iqr(df, cols)"
+ ],
+ "metadata": {
+ "id": "w8cRvgaTAQR1"
+ },
+ "execution_count": 433,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Printing the indices of the outliers detected using the IQR method\n",
+ "print(\"Indices of outliers detected using IQR method:\", outliers_indices_iqr)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "rpKuLqjgASsQ",
+ "outputId": "bd33a0d2-fead-4a2d-88f7-9f8bb5e3fde6"
+ },
+ "execution_count": 434,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Indices of outliers detected using IQR method: [8, 75, 27]\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Visualising the spread of the 'SalaryUSD' column with a horizontal box plot\n",
+ "fig, ax = plt.subplots(figsize=(8, 6))\n",
+ "sns.boxplot(data=df, x='SalaryUSD', orient='h', ax=ax)\n",
+ "ax.set_title('Horizontal Box Plot of SalaryUSD')\n",
+ "ax.set_xlabel('Salary (USD)')\n",
+ "plt.show()\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 573
+ },
+ "id": "-AOrLBl_DXZv",
+ "outputId": "452d3792-e4a4-4738-8cd7-63ccf5e679bf"
+ },
+ "execution_count": 435,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "