suppressWarnings(library(dplyr))
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
suppressWarnings(library(caret)) # dummyVars
## Loading required package: lattice
## Loading required package: ggplot2
# Load the semicolon-separated download statistics. Quoting is disabled
# (quote = "") and short rows are padded (fill = TRUE); as.is = TRUE keeps
# character columns as strings instead of converting them to factors.
downloads <- read.table(
  file = "../files/downloads.csv.gz",
  header = TRUE,
  sep = ";",
  quote = "",
  as.is = TRUE,
  fill = TRUE
)

# Correlate price with confidence. `confidence` is a character column, so it
# is expanded into indicator columns with caret's dummyVars() before cor().
# Rows with missing values are dropped pair by pair, not globally.
data <- select(downloads, mean.price, confidence)
new.data <- data.frame(predict(dummyVars("~ .", data), newdata = data))
c <- cor(new.data, use = "pairwise.complete.obs")
# The %>% operator.
# Two equivalent ways of converting the raw data frame into a tibble:
# first as an ordinary function call ...
t <- as_tibble(x = downloads)
# ... then the same conversion written with the pipe.
t <- downloads %>%
  as_tibble()
# Keep only the three columns of interest; the result is printed.
select(.data = t, mean.price, downloads, confidence)
## # A tibble: 1,156,153 x 3
## mean.price downloads confidence
## <dbl> <int> <chr>
## 1 0.89 3739 average
## 2 0.89 3738 average
## 3 1.09 4578 average
## 4 0.79 3318 average
## 5 0.89 3738 average
## 6 0.89 3738 average
## 7 0.79 3318 average
## 8 0.69 2898 average
## 9 0.89 5290 good
## 10 0.99 5710 good
## # ... with 1,156,143 more rows
# The same three-column selection, this time expressed as a pipeline.
t %>%
  select(mean.price, downloads, confidence)
## # A tibble: 1,156,153 x 3
## mean.price downloads confidence
## <dbl> <int> <chr>
## 1 0.89 3739 average
## 2 0.89 3738 average
## 3 1.09 4578 average
## 4 0.79 3318 average
## 5 0.89 3738 average
## 6 0.89 3738 average
## 7 0.79 3318 average
## 8 0.69 2898 average
## 9 0.89 5290 good
## 10 0.99 5710 good
## # ... with 1,156,143 more rows
# List the distinct confidence labels as a plain character vector,
# in order of first appearance.
t %>%
  distinct(confidence) %>%
  pull(confidence)
## [1] "average" "good" "very good" "excellent" "poor" "terrible"
# Map the textual confidence label onto an ordinal score from 0 ("terrible")
# to 5 ("excellent"). The position of each label in this vector defines its
# score; match() returns NA for missing or unrecognised labels, and
# subtracting the double 1 keeps the new column numeric (double).
confidence_levels <- c(
  "terrible", "poor", "average", "good", "very good", "excellent"
)
tt <- t %>%
  mutate(confidence.numeric = match(confidence, confidence_levels) - 1)
tt
## # A tibble: 1,156,153 x 6
## artist title mean.price downloads confidence
## <chr> <chr> <dbl> <int> <chr>
## 1 0.89 3739 average
## 2 0010110000010011 Wheels 0.89 3738 average
## 3 004Santogold LES Artistes 1.09 4578 average
## 4 006 Like What, Me Worry 0.79 3318 average
## 5 009 Sound System Born To Be Wasted 0.89 3738 average
## 6 009 Sound System Space And Time 0.89 3738 average
## 7 00Genesis Dream Catcher 0.79 3318 average
## 8 00Genesis Ferris Wheel 0.69 2898 average
## 9 0131 Corpo E Mente 0.89 5290 good
## 10 0131 Diamente 0.99 5710 good
## # ... with 1,156,143 more rows, and 1 more variables:
## # confidence.numeric <dbl>
# Keep tracks that have a price, a non-empty artist and title, and a
# confidence score above 3 (i.e. "very good" or "excellent").
excellent <- filter(
  tt,
  !is.na(mean.price),
  artist != "",
  title != "",
  confidence.numeric > 3
)
# Print the filtered table.
excellent
## # A tibble: 520,135 x 6
## artist title mean.price downloads confidence
## <chr> <chr> <dbl> <int> <chr>
## 1 0131 Il Danno 1.19 6646 very good
## 2 091 Un Día De Lluvia 0.69 5219 excellent
## 3 10000 Maniacs A Campfire Song 0.89 6548 excellent
## 4 10000 Maniacs Across The Fields 1.09 7310 excellent
## 5 10000 Maniacs All That Never Happens 0.79 6014 excellent
## 6 10000 Maniacs Among The Americans 1.09 7325 excellent
## 7 10000 Maniacs Anthem For Doomed Youth 0.59 5109 very good
## 8 10000 Maniacs Arbor Day 1.09 7298 excellent
## 9 10000 Maniacs A Room For Everything 0.79 6027 excellent
## 10 10000 Maniacs Back O' The Moon 1.29 8165 excellent
## # ... with 520,125 more rows, and 1 more variables:
## # confidence.numeric <dbl>
# Show the first five rows; tail() would show the last ones instead.
head(excellent, n = 5)
## # A tibble: 5 x 6
## artist title mean.price downloads confidence
## <chr> <chr> <dbl> <int> <chr>
## 1 0131 Il Danno 1.19 6646 very good
## 2 091 Un Día De Lluvia 0.69 5219 excellent
## 3 10000 Maniacs A Campfire Song 0.89 6548 excellent
## 4 10000 Maniacs Across The Fields 1.09 7310 excellent
## 5 10000 Maniacs All That Never Happens 0.79 6014 excellent
## # ... with 1 more variables: confidence.numeric <dbl>
# Five most expensive tracks: sort by price, highest first, then cut.
excellent %>%
  arrange(desc(mean.price)) %>%
  head(n = 5)
## # A tibble: 5 x 6
## artist title mean.price
## <chr> <chr> <dbl>
## 1 Patrick Andy You Step 2.39
## 2 Poster Children If You See Kay 2.39
## 3 Smokey RobinsonThe Miracles I'll Try Something New 2.39
## 4 Sons Of The Pioneers The Everlasting Hills Of Oklahoma 2.39
## 5 Us3 India 2.39
## # ... with 3 more variables: downloads <int>, confidence <chr>,
## # confidence.numeric <dbl>
# Draw ten rows at random (no seed is set, so the rows differ per run).
sample_n(excellent, size = 10)
## # A tibble: 10 x 6
## artist title mean.price
## <chr> <chr> <dbl>
## 1 David Ford Go To Hell 0.79
## 2 Sohodolls Stripper 0.89
## 3 Eric Burdon The Dream 0.79
## 4 Kevin Toney Satin Doll 0.79
## 5 Hope Partlow Everywhere But Here 0.89
## 6 Sham 69 Hurry Up Harry 0.89
## 7 Master Margherita Sogno 1.09
## 8 Dina Carroll Falling 0.99
## 9 Eden's Bridge Here Is Love 0.99
## 10 Christian McBride Inside Straight Uncle James 0.89
## # ... with 3 more variables: downloads <int>, confidence <chr>,
## # confidence.numeric <dbl>
# Draw a random 0.001 % of the rows.
sample_frac(excellent, size = 1e-5)
## # A tibble: 5 x 6
## artist title mean.price downloads confidence
## <chr> <chr> <dbl> <int> <chr>
## 1 Whiskey Rebels Summertime 1.89 9819 excellent
## 2 Janez Detd Killing Me 1.19 8396 excellent
## 3 Inspiral Carpets Lovegrove 0.79 5540 excellent
## 4 Diorama A DIFFERENT LIFE 0.89 6073 very good
## 5 Max Stalling Scars and Souvenirs 0.99 6017 excellent
## # ... with 1 more variables: confidence.numeric <dbl>
# One row per artist, each with a value drawn from N(mean = 100, sd = 50).
# n() supplies the number of distinct artists, i.e. how many draws to make.
random_data <- excellent %>%
  distinct(artist) %>%
  mutate(random_value = rnorm(n(), mean = 100, sd = 50))
# Print the lookup table.
random_data
## # A tibble: 49,422 x 2
## artist random_value
## <chr> <dbl>
## 1 0131 47.57599
## 2 091 138.82757
## 3 10000 Maniacs 152.29532
## 4 1000 Homo DJs 122.23192
## 5 1000 Homo DJ's 116.61590
## 6 1000 Mexicans 98.71083
## 7 1000names 76.09348
## 8 1000 Names 125.26632
## 9 1000 Robota 150.68128
## 10 1001 58.73839
## # ... with 49,412 more rows
# Histogram of the per-artist random values (one observation per artist).
ggplot(data = random_data, mapping = aes(x = random_value)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Attach each artist's random value to every track. random_data was built
# from the artists in `excellent`, so every row finds a match and the inner
# and left joins return the same rows; the second assignment simply replaces
# the first.
joined <- excellent %>% inner_join(random_data, by = "artist")
joined <- excellent %>% left_join(random_data, by = "artist")
# Print the joined table.
joined
## # A tibble: 520,135 x 7
## artist title mean.price downloads confidence
## <chr> <chr> <dbl> <int> <chr>
## 1 0131 Il Danno 1.19 6646 very good
## 2 091 Un Día De Lluvia 0.69 5219 excellent
## 3 10000 Maniacs A Campfire Song 0.89 6548 excellent
## 4 10000 Maniacs Across The Fields 1.09 7310 excellent
## 5 10000 Maniacs All That Never Happens 0.79 6014 excellent
## 6 10000 Maniacs Among The Americans 1.09 7325 excellent
## 7 10000 Maniacs Anthem For Doomed Youth 0.59 5109 very good
## 8 10000 Maniacs Arbor Day 1.09 7298 excellent
## 9 10000 Maniacs A Room For Everything 0.79 6027 excellent
## 10 10000 Maniacs Back O' The Moon 1.29 8165 excellent
## # ... with 520,125 more rows, and 2 more variables:
## # confidence.numeric <dbl>, random_value <dbl>
# Histogram of the random values after the join: each artist's value is now
# repeated once per track.
ggplot(data = joined, mapping = aes(x = random_value)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Number of tracks per artist (column `n`); the result stays grouped by
# artist.
c <- count(group_by(joined, artist))
# Print the per-artist counts.
c
## # A tibble: 49,422 x 2
## # Groups: artist [49,422]
## artist n
## <chr> <int>
## 1 0131 1
## 2 091 1
## 3 10000 Maniacs 61
## 4 1000 Homo DJs 3
## 5 1000 Homo DJ's 1
## 6 1000 Mexicans 8
## 7 1000names 36
## 8 1000 Names 1
## 9 1000 Robota 5
## 10 1001 1
## # ... with 49,412 more rows
# Distribution of the number of tracks per artist.
ggplot(data = c, mapping = aes(x = n)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Per-artist summary: median price, total and mean downloads, mean
# confidence score, and the raw confidence labels kept as a list-column.
summarise(
  group_by(joined, artist),
  median.price = median(mean.price),
  total.downloads = sum(downloads),
  mean.downloads = mean(downloads),
  mean.confidence = mean(confidence.numeric),
  confidence_list = list(confidence)
)
## # A tibble: 49,422 x 6
## artist median.price total.downloads mean.downloads
## <chr> <dbl> <int> <dbl>
## 1 0131 1.19 6646 6646.000
## 2 091 0.69 5219 5219.000
## 3 10000 Maniacs 0.89 412593 6763.820
## 4 1000 Homo DJs 1.09 21469 7156.333
## 5 1000 Homo DJ's 0.69 5065 5065.000
## 6 1000 Mexicans 0.94 43736 5467.000
## 7 1000names 0.89 209537 5820.472
## 8 1000 Names 1.39 7808 7808.000
## 9 1000 Robota 0.99 33338 6667.600
## 10 1001 0.99 7092 7092.000
## # ... with 49,412 more rows, and 2 more variables: mean.confidence <dbl>,
## # confidence_list <list>
# Start a Spark session. SparkR is attached after dplyr, so from here on
# select(), filter(), group_by(), summarize() and collect() resolve to the
# SparkR versions, while %>%, sample_n() and as_tibble() still come from
# dplyr.
library(SparkR)
sparkR.session()

# Ship a 1000-row random sample to Spark. createDataFrame() replaces the dots
# in column names with underscores, hence `mean_price` below.
sample <- downloads %>% sample_n(1000) %>% as.data.frame
df <- createDataFrame(sample, schema = NULL)

# head() on a SparkDataFrame brings the rows back as a local data.frame.
df %>% head(5) %>% as_tibble
# SparkR's select() takes column names as strings (or Column objects); a bare
# symbol would be looked up as an R variable and fail.
df %>% select("artist") %>% head(5)
df %>% filter(df$artist == "Madonna") %>% head(5)
# Average price per artist, aggregated in Spark and collected back into R.
# mean() is used so the aggregate matches the `mean_price` column name.
df %>%
  group_by(df$artist) %>%
  summarize(mean_price = mean(df$mean_price)) %>%
  collect