DE장철희: W4M2 nyc tlc analysis #266

Open · wants to merge 16 commits into base: DE장철희_W4
154 changes: 151 additions & 3 deletions .gitignore
@@ -1,3 +1,112 @@
# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,pycharm+all,python,venv,jupyternotebooks
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,pycharm+all,python,venv,jupyternotebooks
*.csv

### JupyterNotebooks ###
# gitignore template for Jupyter Notebooks
# website: http://jupyter.org/

.ipynb_checkpoints
*/.ipynb_checkpoints/*

# IPython
profile_default/
ipython_config.py

share/

# Remove previous ipynb_checkpoints
# git rm -r .ipynb_checkpoints/

### PyCharm+all ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# AWS User-specific
.idea/**/aws.xml

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# SonarLint plugin
.idea/sonarlint/

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

### PyCharm+all Patch ###
# Ignore everything but code style settings and run configurations
# that are supposed to be shared within teams.

.idea/*

!.idea/codeStyles
!.idea/runConfigurations

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -76,11 +185,8 @@ docs/_build/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
@@ -158,3 +264,45 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml

# ruff
.ruff_cache/

# LSP config files
pyrightconfig.json

### venv ###
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
pyvenv.cfg
pip-selfcheck.json

### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets

# Local History for Visual Studio Code
.history/

# Built Visual Studio Code Extensions
*.vsix

### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide

# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,pycharm+all,python,venv,jupyternotebooks
101 changes: 101 additions & 0 deletions missions/W1/README.md
@@ -0,0 +1,101 @@
# Week 1. Environment Setup

## Table of Contents
1. [Environment Setup](#1-environment-setup)
    - [Installing pyenv](#installing-pyenv)
    - [venv](#venv)
    - [Installing JupyterLab and Notebook](#installing-jupyterlab-and-notebook)
    - [Running Jupyter Notebook inside a venv](#running-jupyter-notebook-inside-a-venv)
    - [Appendix](#appendix)
    - [References](#references)

## 1. Environment Setup

### Installing pyenv
>pyenv lets you install and manage multiple Python versions on one machine.
For installation, see the following link:
[GitHub-pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation)

Since project-specific environments will be handled with venv anyway, the Python installed through pyenv can simply be set as the global version.
```bash
pyenv global 3.12 # use (install) python@3.12 globally
```
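As a quick sanity check (a sketch, nothing repo-specific), confirm which interpreter pyenv now resolves:
```bash
pyenv versions    # '*' marks the currently active version
python --version  # should report Python 3.12.x
```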
<br>

### venv
>venv provides virtual environments, so each working directory can manage its own Python version and dependencies independently (see the sketch below).
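A minimal sketch of the isolation venv provides; the `.venv` folder name and the `requests` package are illustrative choices, not taken from this repo:
```bash
python -m venv .venv        # create an environment in ./.venv (hypothetical name)
source .venv/bin/activate   # switch this shell to the venv's interpreter
pip install requests        # installs into .venv/lib/..., not the pyenv global
deactivate                  # return to the global interpreter
```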

<br>

### Installing JupyterLab and Notebook
Install JupyterLab and Notebook by following [Jupyter Install](https://jupyter.org/install).
<br>

### Running Jupyter Notebook inside a venv
If Jupyter is launched as-is, it runs on the globally configured pyenv Python, and every dependency installed later accumulates there as well.
That forfeits the benefits of venv, so we configure Jupyter to use the venv instead of the global pyenv.
<br>
>Install and use packages in the venv (local) rather than in pyenv (global).

To find out which Python is being executed, simply use `which`:
```
which python
```
![figure-1](assets/figure-1.png)
It currently resolves to python@3.12 from the globally configured pyenv.
We will replace this with a venv.
<br>
Create a venv in the W1 folder:
```bash
python -m venv <working-directory>
```
![figure-2](assets/figure-2.png)
<br>
Activate the virtual environment:
```bash
source <working-directory>/bin/activate
```
![figure-3](assets/figure-3.png)
<br>
Now we need to connect Jupyter Notebook to the venv.
Install ipykernel:
- Install it after `activate`, so that ipykernel also ends up inside the virtual environment.
```bash
pip install ipykernel
```
<br>

Create a kernel: specify the virtual environment to add and the display name to show in Jupyter.
```bash
python -m ipykernel install --user --name [venv-folder] --display-name [name shown in Jupyter]
# python -m ipykernel install --user --name W1 --display-name W1-venv
```

<br>

Launch Jupyter, create an ipynb file, and switch its kernel.
![figure-4](assets/figure-4.png)

<br>

Install a new package and confirm that it ends up in the right place (the venv).
![figure-5](assets/figure-5.png)
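One way to verify this from the activated shell or a notebook cell (`pandas` is just an example package, not one the original installs):
```bash
pip show pandas | grep Location   # Location should point inside the venv folder
```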

<br>

#### Appendix
How to list the existing kernels and remove one:
```bash
# List all kernels and grab the name of the kernel you want to remove
jupyter kernelspec list
# Remove it
jupyter kernelspec remove <kernel_name>
```

<br>

##### References
[Connecting and removing a virtual environment (virtualenv) in JupyterLab](https://raki-1203.github.io/jupyter/JupyterLab_venv_add_delete/)



Binary file added missions/W1/assets/figure-1.png
Binary file added missions/W1/assets/figure-2.png
Binary file added missions/W1/assets/figure-3.png
Binary file added missions/W1/assets/figure-4.png
Binary file added missions/W1/assets/figure-5.png
33 changes: 0 additions & 33 deletions missions/W1/mtcars.csv

This file was deleted.

78 changes: 78 additions & 0 deletions missions/W4/spark-analysis/Dockerfile
@@ -0,0 +1,78 @@
FROM ubuntu:22.04

ENV HADOOP_VERSION=3.4.0
ENV HADOOP_HOME=/opt/hadoop
ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
# Set Hadoop User
ENV HDFS_NAMENODE_USER=hdfs
ENV HDFS_DATANODE_USER=hdfs
ENV HDFS_SECONDARYNAMENODE_USER=hdfs
ENV YARN_RESOURCEMANAGER_USER=yarn
ENV YARN_NODEMANAGER_USER=yarn
# Set Spark env.
ENV SPARK_VERSION=3.5.1
ENV SPARK_HOME=/opt/spark
ENV SPARK_CONF_DIR=$SPARK_HOME/conf
ENV MASTER=spark://hadoop-master:7077

ENV PDSH_RCMD_TYPE=ssh
# NOTE: arm64-specific path; on x86_64 images use /usr/lib/jvm/java-11-openjdk-amd64
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-arm64
ENV PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin


RUN apt-get update && \
apt-get install -y openjdk-11-jdk python3 python3-pip ssh pdsh wget sudo && \
apt-get clean

# Download and extract Hadoop to /opt
RUN wget https://dlcdn.apache.org/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz -P /opt && \
tar -xzvf /opt/hadoop-$HADOOP_VERSION.tar.gz -C /opt && \
mv /opt/hadoop-$HADOOP_VERSION $HADOOP_HOME && \
rm /opt/hadoop-$HADOOP_VERSION.tar.gz

# Download and extract Spark to /opt
RUN wget http://apache.mirror.cdnetworks.com/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3.tgz -P /opt && \
tar -xzvf /opt/spark-$SPARK_VERSION-bin-hadoop3.tgz -C /opt && \
mv /opt/spark-$SPARK_VERSION-bin-hadoop3 $SPARK_HOME && \
rm /opt/spark-$SPARK_VERSION-bin-hadoop3.tgz

RUN pip3 install pyspark

# Create users and groups, and set up permissions
RUN groupadd -g 1000 hadoop && \
useradd -m -u 1001 -g 1000 hdfs && \
useradd -m -u 1002 -g 1000 yarn && \
useradd -m -u 1003 -g 1000 spark && \
echo "hdfs ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers && \
echo "yarn ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers && \
echo "spark ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

USER hdfs
# Create ~/.ssh first: ssh-keygen does not create parent directories for -f paths
RUN mkdir -p ~/.ssh && \
ssh-keygen -t rsa -N '' -f ~/.ssh/id_rsa && \
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
chmod 0600 ~/.ssh/authorized_keys

USER yarn
RUN mkdir -p ~/.ssh && \
ssh-keygen -t rsa -N '' -f ~/.ssh/id_rsa && \
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
chmod 0600 ~/.ssh/authorized_keys

USER spark
RUN mkdir -p ~/.ssh && \
ssh-keygen -t rsa -N '' -f ~/.ssh/id_rsa && \
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
chmod 0600 ~/.ssh/authorized_keys

USER root
RUN mkdir -p $HADOOP_HOME/logs && mkdir -p $HADOOP_HOME/data && \
chown -R :hadoop $HADOOP_HOME/ && \
chown -R hdfs:hadoop $HADOOP_HOME/data && \
chmod -R 775 $HADOOP_HOME/

RUN chown -R spark:hadoop $SPARK_HOME

RUN echo "export JAVA_HOME=$(jrunscript -e 'java.lang.System.out.println(java.lang.System.getProperty("java.home"));')" >> $HADOOP_CONF_DIR/hadoop-env.sh

ENTRYPOINT ["/usr/local/bin/start.sh"]
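
The `ENTRYPOINT` references `/usr/local/bin/start.sh`, which is not included in this diff, so the actual startup logic is unknown. A hypothetical minimal sketch, assuming the `hadoop-master` hostname implied by the `MASTER` variable:
```bash
#!/usr/bin/env bash
# Hypothetical start.sh sketch -- NOT the script shipped with this PR.
service ssh start

if [ "$(hostname)" = "hadoop-master" ]; then
    # Format HDFS on first boot (ignore failure if already formatted),
    # then bring up HDFS, YARN, and the Spark master.
    sudo -u hdfs "$HADOOP_HOME/bin/hdfs" namenode -format -nonInteractive || true
    sudo -u hdfs "$HADOOP_HOME/sbin/start-dfs.sh"
    sudo -u yarn "$HADOOP_HOME/sbin/start-yarn.sh"
    sudo -u spark "$SPARK_HOME/sbin/start-master.sh"
else
    sudo -u spark "$SPARK_HOME/sbin/start-worker.sh" "$MASTER"
fi

tail -f /dev/null  # keep the container in the foreground
```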
1 change: 1 addition & 0 deletions missions/W4/spark-analysis/README.md
@@ -0,0 +1 @@
# Start
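
The README is still a stub. A hypothetical build-and-run sequence for the image above (names are placeholders; no compose file appears in this diff):
```bash
docker build -t spark-analysis missions/W4/spark-analysis
docker run -d --name hadoop-master --hostname hadoop-master spark-analysis
```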