-
Notifications
You must be signed in to change notification settings - Fork 0
/
A_building_subreddit_month_df.Rmd
161 lines (133 loc) · 5.62 KB
/
A_building_subreddit_month_df.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
---
title: "Building the subreddit_month database"
author: "Emilio Robleda"
date: "`r Sys.Date()`"
output: html_document
---
### Load libraries and data
```{r Load Libraries, message=FALSE, warning=FALSE}
library(tidyverse)
library(lubridate)
library(tibble)
library(zoo)
library(readr)
```
```{r load data}
# Extract the raw CSV from the zip archive and load it into `data`.
zip_folder <- "data.zip"
file_to_extract <- "data.csv"
destination_path <- "./" # Specify the desired destination path

# Fail early with a clear message instead of a cryptic unzip() warning.
if (!file.exists(zip_folder)) {
  stop("Archive not found: ", zip_folder, call. = FALSE)
}

# unzip() only warns when extraction fails and returns (invisibly) the paths
# it actually wrote. Checking that result prevents silently reading a stale
# copy of the CSV left over from an earlier run.
extracted_path <- unzip(
  zip_folder,
  files = file_to_extract,
  exdir = destination_path
)
if (length(extracted_path) == 0L) {
  stop(
    "Could not extract '", file_to_extract, "' from '", zip_folder, "'",
    call. = FALSE
  )
}

# Read the file from where it was extracted rather than a hard-coded name,
# so changing destination_path or file_to_extract keeps this chunk working.
data <- read_csv(extracted_path[[1]])
```
### Basic descriptive stats
Structure of the raw data
```{r display structure}
# Compact overview of the raw data: one line per column with type and a preview
utils::str(object = data)
```
### Data manipulation
Change the format of date variables
```{r dates in format}
# Parse a date column whose strings may use different layouts.
#
# Each format is tried in turn and only fills the entries that are still
# unparsed, so one malformed string cannot abort the conversion of the whole
# column (a format-less as.Date() call errors if the first string it sees is
# not in a standard format).
#
# x: character (or factor) vector of dates; Date/POSIXct input is passed
#    straight through as.Date().
# Returns a Date vector of the same length; unparseable entries are NA.
parse_mixed_date <- function(x) {
  if (is.factor(x)) {
    x <- as.character(x)
  }
  if (!is.character(x)) {
    # Already a date-like type: no format guessing needed.
    return(as.Date(x))
  }
  # Primary layout is month/day/year (MM/DD/YYYY).
  parsed <- as.Date(x, format = "%m/%d/%Y")
  # Fallback layouts, the same two that as.Date() tries by default.
  for (fmt in c("%Y-%m-%d", "%Y/%m/%d")) {
    pending <- is.na(parsed) & !is.na(x)
    parsed[pending] <- as.Date(x[pending], format = fmt)
  }
  parsed
}

# Create a new variable called "comment_date" from the raw "date" column
data$comment_date <- parse_mixed_date(data$date)
# Create "posts_date" from the raw "post_date" column (MM/DD/YYYY or ISO)
data$posts_date <- parse_mixed_date(data$post_date)
```
Include new variables
```{r new variables}
# Turn a logical condition into a 1/0 indicator (NA stays NA).
as_dummy <- function(condition) ifelse(condition, 1, 0)

# Add comment-level indicator columns, then flag moderated threads.
data <- data %>%
  mutate(
    # 1 if the comment text is the "[deleted]" placeholder
    deleted = as_dummy(comment == "[deleted]"),
    # 1 if the comment text is the "[removed]" placeholder
    removed = as_dummy(comment == "[removed]"),
    # 1 if the comment was either deleted or removed
    deleted_removed = as_dummy(deleted == 1 | removed == 1),
    # 1 if the comment was written by the AutoModerator account
    automod = as_dummy(author == "AutoModerator"),
    # 1 if the post was submitted during Euro 2022 (July 2022 in this study)
    during_euros = as_dummy(
      posts_date >= as.Date("2022-07-01") & posts_date < as.Date("2022-08-01")
    ),
    # 1 if the post was submitted after Euro 2022 (August 2022 onwards)
    after_euros = as_dummy(posts_date >= as.Date("2022-08-01")),
    # 1 for the four treated subreddits (clubs with a player in England's
    # Euro 2022 squad)
    treatment = as_dummy(
      subreddit %in% c("reddevils", "chelseafc", "Gunners", "MCFC")
    ),
    # 1 on the first row seen for each post_timestamp, 0 on later rows
    first_comment = as_dummy(!duplicated(post_timestamp)),
    # Comment length in characters (not tokens); NA for deleted, removed and
    # AutoModerator comments
    comment_chars = ifelse(
      deleted == 1 | removed == 1 | automod == 1,
      NA,
      nchar(comment)
    ),
    # "year-month" label built from the post date
    month_year = paste(year(posts_date), month(posts_date), sep = "-"),
    # "subreddit_year-month" key used to build the panel
    subreddit_month = paste(subreddit, month_year, sep = "_")
  ) %>%
  # One group per post (posts are identified by post_timestamp)
  group_by(post_timestamp) %>%
  # 1 on a post's first row when its thread has a deleted or removed comment
  mutate(
    moderated_post = as_dummy(first_comment == 1 & any(deleted_removed == 1))
  ) %>%
  ungroup()
```
### Building a subreddit_month panel structure of the data
```{r subreddit_month panel}
# Collapse the comment-level data into one row per subreddit and month.
#
# Post-level statistics (avg_post_score, moderated_posts) are computed from
# the distinct posts that remain AFTER AutoModerator comments are dropped.
# Relying on a "first row of the thread" flag set before that filter loses
# every post whose first row is an AutoModerator comment, while the same post
# is still counted in `posts`.
subreddit_stats <- data %>%
  # Flag whole threads with at least one deleted or removed comment. This is
  # done before filtering so AutoModerator rows still count towards the flag.
  group_by(post_timestamp) %>%
  mutate(thread_moderated = any(deleted_removed == 1, na.rm = TRUE)) %>%
  ungroup() %>%
  # Drop comments written by the AutoModerator
  filter(automod != 1) %>%
  # One group per subreddit-month
  group_by(subreddit_month) %>%
  summarize(
    # Group dummy; g == 1 means treatment group
    g = as.numeric(any(treatment == 1)),
    # Time dummy; t == 1 means post-treatment (July 2022 is neither pre nor
    # post treatment)
    t = as.numeric(any(after_euros == 1)),
    # Treatment dummy; z == 1 means post-treatment rows of treated units
    z = as.numeric(g == 1 & t == 1),
    # Total number of posts
    posts = n_distinct(post_timestamp),
    # Total number of comments
    comments = n(),
    # Mean comments per post
    avg_com = round(comments / posts, 1),
    # Mean score per comment
    avg_score = round(mean(score), 1),
    # Mean number of characters per comment
    avg_chars = round(mean(comment_chars, na.rm = TRUE), 1),
    # Mean post_score over distinct posts (one row per post_timestamp).
    # NOTE(review): assumes post_score is constant within a post - confirm.
    avg_post_score = round(mean(post_score[!duplicated(post_timestamp)]), 1),
    # Number of posts with at least one deleted or removed comment in thread
    moderated_posts = n_distinct(post_timestamp[thread_moderated]),
    # Proportion of posts that included moderation
    mod_prop = round(moderated_posts / posts, 3),
    # Total number of deleted comments
    del_com = sum(deleted == 1, na.rm = TRUE),
    # Total number of removed comments
    rem_com = sum(removed == 1, na.rm = TRUE),
    # Proportion of deleted comments
    del_prop = round(del_com / comments, 3),
    # Proportion of removed comments
    rem_prop = round(rem_com / comments, 3)
  ) %>%
  mutate(
    # Greedy match: everything before the LAST underscore is the subreddit
    subreddit = sub("(.*)_.*", "\\1", subreddit_month),
    # Everything after the last underscore is the "year-month" label
    month = sub(".*_(.*)", "\\1", subreddit_month)
  ) %>%
  # Convert to data frame
  as.data.frame()

# Convert the "year-month" label to a zoo yearmon value
subreddit_stats$month <- as.yearmon(subreddit_stats$month, format = "%Y-%m")

# Select variables in desired order and sort by subreddit, then month
subreddit_stats <- subreddit_stats %>%
  select(
    subreddit, month, g, t, z, posts, comments, avg_com, avg_score,
    avg_post_score, avg_chars, moderated_posts, mod_prop, del_com, rem_com,
    del_prop, rem_prop
  ) %>%
  # A single arrange() states the sort order explicitly instead of relying on
  # two chained stable sorts.
  arrange(subreddit, month)
```
### Save data as .csv file
```{r view data and save as csv}
# Write the subreddit-month panel to disk; row.names = TRUE keeps the row
# index as the first column of the CSV.
write.csv(subreddit_stats, file = "subreddit_stats.csv", row.names = TRUE)
```
```{r}
# See context of Data
```