The dataset comes from a tracking schema that we use for assessing user satisfaction. Desktop users are randomly sampled to be anonymously tracked by this schema, which uses an “I’m alive” pinging system that lets us estimate how long users stay on the pages they visit. The dataset contains just over a week of EventLogging (EL) data.
https://github.com/wikimedia-research/Discovery-Hiring-Analyst-2016
# Import some libraries
library(ggplot2)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:plyr':
##
## here
## The following object is masked from 'package:base':
##
## date
# Import dataset CSV and explicitly cast each column type
events <- read.table(
  file = "C:/Users/Eirika/Documents/Code/Datasets/events_log/events_log.csv",
  sep = ",", header = TRUE,
  colClasses = c(
    "character", "character", "character", "character", "character",
    "integer", "character", "integer", "integer"
  )
)
# Using ymd_hms and date, convert timestamp column
events$timestamp <- ymd_hms(events$timestamp)
## Warning: 4 failed to parse.
events$date <- date(events$timestamp)
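As a rough illustration of the “I’m alive” pinging described above, here is a minimal sketch of estimating time-on-page. It assumes the checkin column records seconds elapsed since the page loaded and that ping events carry action == "checkin", so the last check-in per page visit gives a lower bound on dwell time:
# Approximate dwell time: the latest check-in seen for each page visit
# (assumes checkin = seconds since page load; a lower bound, not an exact time)
dwell <- events %>%
  filter(action == "checkin") %>%
  group_by(session_id, page_id) %>%
  summarise(dwell_seconds = max(checkin))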
We want to select all events whose timestamps failed to parse (and are now NA) and decide what to do with them.
# Troubleshooting
# Pull all records out with NA timestamp to a separate data frame to view
fails <- events[is.na(events$timestamp),]
# Remove fails from events df?
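Since only four rows failed to parse, one reasonable option (a judgment call, not the only one) is to drop them before analysis:
# Drop the rows whose timestamps could not be parsed
events <- events[!is.na(events$timestamp),]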
Daily clickthrough counts range from a high of 6654 on 3/1/16 to a low of 3323 on 3/5/16. Group A accounts for the bulk of the clickthrough events, whereas group B has far fewer.
# Pull out data only of visitPage clickthrough events
clickthrough <- events[events$action == 'visitPage',]
# Pull out clickthroughs that resulted from non-NA result position
clickthrough <- clickthrough[!is.na(clickthrough$result_position),]
# Chart by day (use bare column names in aes(), not df$col)
cpd <- ggplot(clickthrough, aes(x = date)) +
  geom_bar() +
  labs(title = "Overall", x = "Date", y = "Clickthroughs") +
  theme(axis.text.x = element_text(angle = 90))
print(cpd)
# Chart per group A and B
groupa <- clickthrough[clickthrough$group == 'a',]
groupb <- clickthrough[clickthrough$group == 'b',]
cpda <- ggplot(groupa, aes(x = date)) +
  geom_bar() +
  labs(title = "Group A", x = "Date", y = "Clickthroughs") +
  theme(axis.text.x = element_text(angle = 90))
cpdb <- ggplot(groupb, aes(x = date)) +
  geom_bar() +
  labs(title = "Group B", x = "Date", y = "Clickthroughs") +
  theme(axis.text.x = element_text(angle = 90))
grid.arrange(cpd, cpda, cpdb, ncol = 3)
# Overall daily clickthrough counts
table(clickthrough$date)
# Group A daily clickthrough counts
table(groupa$date)
# Group B daily clickthrough counts
table(groupb$date)
# How large is each group?
# The original count(events$group) failed ("no applicable method for 'groups'
# applied to an object of class 'character'") because dplyr::count() masks
# plyr::count() and expects a data frame, not a vector. table() works directly.
groupsize <- table(events$group)
print(groupsize)
By far, most users click the first result first. On 3/1/16 there was a cluster of clicked results around position 4000, which did not appear on any other day. 3/4/16 had the largest variability in result position.
# Pull out all data that have a result_position that is not NA
results <- events[!(is.na(events$result_position)),]
# table(results$result_position) # for a frequency table
# Graph of the distribution of result_position
rtf <- ggplot(results, aes(x = result_position)) +
  geom_bar() +
  coord_cartesian(xlim = c(0, 10)) +
  labs(title = "Results Clicked First", x = "Result Position", y = "Frequency")
print(rtf)
# Zoom in on only result positions in the top 500
results_500 <- results[results$result_position < 500,]
days_500 <- ggplot(results_500, aes(x = date, y = result_position, colour = result_position)) +
  geom_jitter() +
  labs(title = "Results Clicked First", x = "Day", y = "Result Position", colour = "Result Position") +
  scale_colour_gradientn(colours = rainbow(4))
print(days_500)
## Warning: Removed 2 rows containing missing values (geom_point).
# Zoom in on only result positions in the top 200
results_200 <- results_500[results_500$result_position < 200,]
days_200 <- ggplot(results_200, aes(x = date, y = result_position, colour = result_position)) +
  geom_jitter() +
  labs(title = "Results Clicked First", x = "Day", y = "Result Position", colour = "Result Position") +
  scale_colour_gradientn(colours = rainbow(4))
print(days_200)
## Warning: Removed 2 rows containing missing values (geom_point).
The daily overall zero results rate ranges between 10% and 15%. Group A tracks the overall trend closely, whereas Group B shows less day-to-day variability in its zero results rate.
# Pull out only searchResultPage actions
search_results <- events[events$action == "searchResultPage",]
# Pull out only n_results of 0, excluding NA
not_na_results <- search_results[!is.na(search_results$n_results),]
zero_results <- not_na_results[not_na_results$n_results == 0,]
# Pull out group A and group B data
zero_a <- zero_results[zero_results$group == "a",]
zero_b <- zero_results[zero_results$group == "b",]
# Plot by day
zrpd <- ggplot(zero_results) +
  geom_bar(aes(x = date, y = ..prop.., group = 1), stat = "count") +
  labs(title = "Zero Results Per Day (Overall)", x = "Day", y = "Percentage of Zero Results") +
  scale_y_continuous(labels = scales::percent_format())
# Plot by group
zra <- ggplot(zero_a) +
  geom_bar(aes(x = date, y = ..prop.., group = 1), stat = "count") +
  labs(title = "Zero Results Per Day (Group A)", x = "Day", y = "Percentage of Zero Results") +
  scale_y_continuous(labels = scales::percent_format())
zrb <- ggplot(zero_b) +
  geom_bar(aes(x = date, y = ..prop.., group = 1), stat = "count") +
  labs(title = "Zero Results Per Day (Group B)", x = "Day", y = "Percentage of Zero Results") +
  scale_y_continuous(labels = scales::percent_format())
print(zrpd)
print(zra)
print(zrb)
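Note that the bars above show each day's share of all zero-result events. The zero results rate proper is zero-result searches divided by all searches on a given day; a minimal sketch of computing it directly, reusing not_na_results from above:
# Daily zero results rate: fraction of searches returning zero results
zrr <- not_na_results %>%
  group_by(date) %>%
  summarise(rate = mean(n_results == 0))
print(zrr)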
Session length appears to differ slightly between groups A and B, with group B having a smaller median session length.
# Group by session_id and compute len: the difference between the earliest and
# latest timestamps in a session, in seconds
# Remove lengths of 0
sessions <- events %>%
  select(group, session_id, timestamp, date) %>%
  group_by(group, session_id) %>%
  summarise(min = min(timestamp), max = max(timestamp)) %>%
  mutate(len = as.numeric(difftime(max, min, units = "secs"))) %>%
  filter(len != 0)
# Box plot comparing session length between groups A and B
# Remove lengths greater than the 95th percentile
box <- ggplot(sessions %>% filter(len < quantile(sessions$len, 0.95)),
              aes(factor(group), len)) +
  geom_boxplot(aes(fill = factor(group)), width = 0.5,
               outlier.colour = "dodgerblue", outlier.size = 1,
               outlier.shape = 16, outlier.stroke = 1) +
  labs(title = "Session length in group A vs group B", x = "Group",
       y = "Session Length (s)")
print(box)
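To put a number on that difference, a quick nonparametric check is a Wilcoxon rank-sum test, which compares the two groups' session-length distributions without assuming normality (a sketch, using the sessions data frame built above):
# Test whether session lengths differ between groups A and B
# (nonparametric, since session lengths are heavily right-skewed)
wilcox.test(len ~ group, data = sessions)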
# Box plot comparing session length across days, separated by group (sketched below)
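A minimal sketch of that comparison. The sessions pipeline above drops the date at summarise(), so the per-session lengths are rebuilt here keeping the date each session started:
# Rebuild per-session lengths, keeping the date each session started
sessions_by_day <- events %>%
  group_by(group, session_id) %>%
  summarise(date = min(date),
            len = as.numeric(difftime(max(timestamp), min(timestamp), units = "secs"))) %>%
  filter(len != 0)
# One box per day, filled by group, trimmed at the 95th percentile
daybox <- ggplot(sessions_by_day %>% filter(len < quantile(sessions_by_day$len, 0.95)),
                 aes(factor(date), len, fill = factor(group))) +
  geom_boxplot() +
  labs(title = "Session length per day by group", x = "Day",
       y = "Session Length (s)", fill = "Group")
print(daybox)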