040_clustering_software_with_profiles.Rmd

---
title: "3_Using_Latent_Profiles_for_Software_Aggregation.Rmd"
mainfont: DejaVu Sans
output:
  pdf_document:
    latex_engine: xelatex
    keep_tex: true
  html_document: default
  word_document: default
font-family: Times New Roman
---


```{r}


source('010_data_preprocessing.R')

library(tidyverse)
library(magrittr)
library(tidyLPA)

library(mclust)
library(forcats)
library(hrbrthemes)

library(knitr)
library(kableExtra)

library(graphics)
library(PMCMR)

library(gridExtra)

source("./functions.R")

```


Extracting each respondent's assigned profile and the posterior probability associated with it

```{r}

# Keep only the respondent id (`RecordNo`), the assigned `profile` and its
# `posterior_prob` from `m3`.
# NOTE(review): `m3` is not created in this file; presumably it comes from the
# sourced preprocessing script - verify.
profiles <- dplyr::select(m3, RecordNo, profile, posterior_prob)

```


```{r}


# Job title / occupation columns for every respondent in `df_clean`.
jobtitleanddescription <- df_clean[, c("RecordNo", "Q18_1_open", "Q18a", "Occupation_DISCO", "Occupation")]

# Attach the occupation to every row of `software_cleaned`; the old `a` column
# and the job-description columns are dropped, so only `Occupation` is kept
# from the job table.
# `dplyr::select` is namespaced like everywhere else in this file, so that a
# `select()` from another attached package cannot be picked up by accident.
merged <- merge(x = software_cleaned, y = jobtitleanddescription, by = "RecordNo") %>%
  dplyr::select(-a, -Q18_1_open, -Q18a, -Occupation_DISCO)

# Remove rows where `a2` is NA, then let `a2` take over the name `a`.
merged <- merged[!is.na(merged$a2), ] %>%
  dplyr::rename(a = a2)

# Give both tables a numeric `RecordNo` so they can be joined on it later.
merged$RecordNo <- as.numeric(merged$RecordNo)
profiles$RecordNo <- as.numeric(profiles$RecordNo)

# Only open the data viewer in an interactive session; an unconditional
# View() call gets in the way of knitting the document.
if (interactive()) {
  View(merged)
}

```

TODO: apply the same approach using education instead of (or together with) profession

```{r}


# Education level per respondent, keyed by a numeric `RecordNo`.
education_df <- df_clean[, c("RecordNo", "profile_education")]
education_df$RecordNo <- as.numeric(education_df$RecordNo)

# Attach education to every row of `software_cleaned`.
# NOTE(review): `software_cleaned$RecordNo` is not converted in this chunk; if
# it is not numeric already, the join keys have different types - confirm.
education_software <- inner_join(software_cleaned, education_df, by = "RecordNo")

# Remove rows where `a2` is NA, drop the old `a` column and let `a2` take
# over the name `a` (same treatment as for `merged`).
education_software <- education_software[!is.na(education_software$a2), ] %>%
  dplyr::select(-a) %>%
  dplyr::rename(a = a2)

# Only open the data viewer in an interactive session; an unconditional
# View() call gets in the way of knitting the document.
if (interactive()) {
  View(education_software)
}


```

Checking for software records that have no matching profile, then attaching the profiles

```{r}

# Rows of `merged` whose `RecordNo` has no counterpart in `profiles`; printed
# so that the rows lost by the inner join below are visible.
anti_join(merged, profiles, by = "RecordNo")

# Keep only the rows whose respondent has a profile and attach `profile` and
# `posterior_prob` to them.
merged_with_profiles <- inner_join(merged, profiles, by = "RecordNo")

# Only open the data viewer in an interactive session; an unconditional
# View() call gets in the way of knitting the document.
if (interactive()) {
  View(merged_with_profiles)
}

```


Idea: sum the respondents' posterior probabilities per software item.

UPD: this doesn't seem to be promising.

Finally, the occupation + profile distribution per software item, used for the mapping

```{r}


# Count, for every software item `a`, how many rows fall into each profile,
# then spread the counts into one column per profile value (missing
# combinations become 0).
# Unused idea kept for reference:
#   dplyr::mutate(val_weighted = posterior_prob * as.numeric(val))
profiling_dist <- merged_with_profiles %>%
  dplyr::select(profile, a) %>%
  gather(key, val, profile) %>%
  group_by(a, val) %>%
  tally() %>%
  spread(val, n, fill = 0)

# checking for merging
# anti_join(pre_dist, profiling_dist, by = "a")
# NOTE(review): `pre_dist` is not created in this file; presumably it is the
# per-software occupation distribution - verify.
occupation_profiling_dist <- inner_join(pre_dist, profiling_dist, by = "a")

# Normalizing frequencies: every column except the first is centred and
# scaled to unit standard deviation. A constant column has zero standard
# deviation and therefore turns into NaN.
occupation_profiling_dist[, -1] <- apply(
  X = occupation_profiling_dist[, -1],
  MARGIN = 2,
  FUN = function(x) scale(x, center = TRUE, scale = TRUE)
)

# Only open the data viewer in an interactive session; an unconditional
# View() call gets in the way of knitting the document.
if (interactive()) {
  View(occupation_profiling_dist)
}

```

# occupation + profiles per software for mapping
## Only 200 most popular software items

```{r}


# Same profile counts per software item as for the full table, but limited to
# the first 200 entries of `software_aggregated$a2`.
# NOTE(review): this assumes `software_aggregated` is sorted by popularity -
# confirm.
# `head()` is used instead of `[1:200]` so that a shorter vector is not padded
# with NA.
# Unused idea kept for reference:
#   dplyr::mutate(val_weighted = posterior_prob * as.numeric(val))
profiling_dist_200 <- merged_with_profiles %>%
  dplyr::filter(a %in% head(software_aggregated$a2, 200)) %>%
  dplyr::select(profile, a) %>%
  gather(key, val, profile) %>%
  group_by(a, val) %>%
  tally() %>%
  spread(val, n, fill = 0)

# checking for merging
# anti_join(pre_dist, profiling_dist_200, by = "a")
occupation_profiling_dist_200 <- inner_join(pre_dist, profiling_dist_200, by = "a")

# Normalizing frequencies.
# Fix: the top-200 table is scaled from its own columns. The previous code
# scaled `occupation_profiling_dist` (the full table) and assigned that result
# here, which does not correspond to the rows of the top-200 table.
occupation_profiling_dist_200[, -1] <- apply(
  X = occupation_profiling_dist_200[, -1],
  MARGIN = 2,
  FUN = function(x) scale(x, center = TRUE, scale = TRUE)
)

# Fix: inspect the top-200 table built in this chunk (the full table was
# shown before). The viewer is only opened in an interactive session, so
# knitting the document is not affected.
if (interactive()) {
  View(occupation_profiling_dist_200)
}

```


WAITING: Function to get a predefined number of software labels for each cluster

```{r}


# Labels are picked by the number of appearances stored in `software_aggregated`.
# Common case:
# labels_to_select = occupation_profiling_dist$a[occupation_profiling_dist$a %in% software_aggregated$a2[1:60]]

# Store the result of `distance_clustering()` as the `cluster` column.
# NOTE(review): `distance_clustering()` is not defined in this file; presumably
# "canberra" is the distance metric and 6 the number of clusters - verify.
occupation_profiling_dist$cluster <- distance_clustering(occupation_profiling_dist, "canberra", 6)

# Bring the appearance count `n` next to every clustered software item.
for_labels <- software_aggregated %>%
  dplyr::select(a = a2, n) %>%
  inner_join(occupation_profiling_dist, by = "a")

# occupation_profiling_dist %>% View

# Within every cluster keep the 10 items with the largest `n`
# (ties can yield more than 10 rows).
labels_to_select <- top_n(group_by(for_labels, cluster), n = 10, wt = n)


```


```{r}


# Store the result of `distance_clustering()` as the `cluster` column of the
# top-200 table.
# NOTE(review): `distance_clustering()` is not defined in this file; presumably
# "canberra" is the distance metric and 6 the number of clusters - verify.
occupation_profiling_dist_200$cluster <- distance_clustering(occupation_profiling_dist_200, "canberra", 6)

# Bring the appearance count `n` next to every clustered software item.
for_labels_200 <- inner_join(
  software_aggregated %>% dplyr::select(a = a2, n),
  occupation_profiling_dist_200,
  by = "a"
)

# occupation_profiling_dist_200 %>% View

# Within every cluster keep the 10 items with the largest `n`
# (ties can yield more than 10 rows).
labels_to_select_200 <- for_labels_200 %>%
  group_by(cluster) %>%
  top_n(n = 10, wt = n)

# Only open the data viewer in an interactive session; an unconditional
# View() call gets in the way of knitting the document.
if (interactive()) {
  View(labels_to_select_200)
}

```


# Mapping all the software items

```{r}

# "euclidean" 'maximum' 'manhattan' 'minkowski' 'canberra' 'binary'

# One map per distance metric, so the resulting layouts can be compared.
# `dev.new()` is evaluated as the `data` argument of `with()`, i.e. a fresh
# graphics device is opened before each plotting call is evaluated.
# NOTE(review): `mds_n_plot()` is not defined in this file (presumably in
# functions.R); the trailing 6 is presumably the number of clusters, matching
# the 6 passed to `distance_clustering()` - confirm.
# NOTE(review): "binary" from the list of metrics above is not plotted here.
with(dev.new(), mds_n_plot(occupation_profiling_dist, "canberra", labels_to_select, 6))
with(dev.new(), mds_n_plot(occupation_profiling_dist, "euclidean", labels_to_select, 6))
with(dev.new(), mds_n_plot(occupation_profiling_dist, "manhattan", labels_to_select, 6))
with(dev.new(), mds_n_plot(occupation_profiling_dist, "minkowski", labels_to_select, 6))
with(dev.new(), mds_n_plot(occupation_profiling_dist, "maximum", labels_to_select, 6))


```


# Mapping only top-200


```{r}


## using only profiles
# The top-200 table is ungrouped and reduced to the software name `a`, the
# `groups` column, the six profile columns ("1" to "6") and `cluster` before
# it is passed to the plotting function; `dev.new()` opens a fresh graphics
# device first.
# NOTE(review): this calls `plot_mds()`, while the chunk that maps all software
# items calls `mds_n_plot()`; neither is defined in this file - confirm that
# both exist and that the different name is intended.
with(dev.new(), plot_mds(occupation_profiling_dist_200 %>% ungroup() %>%  dplyr::select(a, groups, '1', '2', '3', '4','5', '6', cluster), "canberra", labels_to_select_200, 6))


```