suppressWarnings(library(dplyr))
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
suppressWarnings(library(caret)) # dummyVars
## Loading required package: lattice
## Loading required package: ggplot2

Correlation

# Read the semicolon-separated download records. `quote = ""` turns off quote
# processing and `fill = TRUE` pads rows that have fewer fields than the header.
downloads <- read.table(
  file = "../files/downloads.csv.gz",
  header = TRUE,
  sep = ";",
  quote = "",
  as.is = TRUE, # spelled out: `T` is an ordinary variable that can be reassigned
  fill = TRUE
)
# Keep only the two columns whose relationship is examined.
data <- select(downloads, mean.price, confidence)
# Expand the selected columns with caret's dummyVars() so that the textual
# `confidence` column becomes numeric indicator columns usable by cor().
new.data <- data.frame(predict(dummyVars("~ .", data), newdata = data))
# Correlation matrix; each pair of columns uses the rows where both are present.
c <- cor(new.data, use = "pairwise.complete.obs")

dplyr

The %>% operator.

# Two equivalent ways to convert the data frame to a tibble:
# a plain function call ...
t <- as_tibble(x = downloads)
# ... and the pipe, which passes the left-hand side as the first argument.
t <- downloads %>% as_tibble()

# Direct call: the tibble is the first argument, followed by the columns to keep.
select(t, mean.price, downloads, confidence)
## # A tibble: 1,156,153 x 3
##    mean.price downloads confidence
##         <dbl>     <int>      <chr>
##  1       0.89      3739    average
##  2       0.89      3738    average
##  3       1.09      4578    average
##  4       0.79      3318    average
##  5       0.89      3738    average
##  6       0.89      3738    average
##  7       0.79      3318    average
##  8       0.69      2898    average
##  9       0.89      5290       good
## 10       0.99      5710       good
## # ... with 1,156,143 more rows
# Piped form: `%>%` supplies `t` as the first argument of select().
t %>% select(mean.price, downloads, confidence)
## # A tibble: 1,156,153 x 3
##    mean.price downloads confidence
##         <dbl>     <int>      <chr>
##  1       0.89      3739    average
##  2       0.89      3738    average
##  3       1.09      4578    average
##  4       0.79      3318    average
##  5       0.89      3738    average
##  6       0.89      3738    average
##  7       0.79      3318    average
##  8       0.69      2898    average
##  9       0.89      5290       good
## 10       0.99      5710       good
## # ... with 1,156,143 more rows

Selection

# List every distinct confidence label as a plain character vector.
t %>%
  distinct(confidence) %>%
  pull(confidence)
## [1] "average"   "good"      "very good" "excellent" "poor"      "terrible"

Modification

# Map each confidence label onto a 0-5 numeric scale; any label not listed
# (or a missing one) becomes NA.
tt <- t %>%
  mutate(
    confidence.numeric = case_when(
      confidence == "terrible"  ~ 0,
      confidence == "poor"      ~ 1,
      confidence == "average"   ~ 2,
      confidence == "good"      ~ 3,
      confidence == "very good" ~ 4,
      confidence == "excellent" ~ 5,
      TRUE                      ~ NA_real_
    )
  )
tt
## # A tibble: 1,156,153 x 6
##              artist               title mean.price downloads confidence
##               <chr>               <chr>      <dbl>     <int>      <chr>
##  1                                            0.89      3739    average
##  2 0010110000010011              Wheels       0.89      3738    average
##  3     004Santogold        LES Artistes       1.09      4578    average
##  4              006 Like What, Me Worry       0.79      3318    average
##  5 009 Sound System   Born To Be Wasted       0.89      3738    average
##  6 009 Sound System      Space And Time       0.89      3738    average
##  7        00Genesis       Dream Catcher       0.79      3318    average
##  8        00Genesis        Ferris Wheel       0.69      2898    average
##  9             0131       Corpo E Mente       0.89      5290       good
## 10             0131            Diamente       0.99      5710       good
## # ... with 1,156,143 more rows, and 1 more variables:
## #   confidence.numeric <dbl>

Filters

# Keep rows that have a price, a non-empty artist and title, and a confidence
# score above 3 ("very good" or "excellent"). Comma-separated conditions in
# filter() are combined with AND.
excellent <- tt %>%
  filter(
    !is.na(mean.price),
    artist != "",
    title != "",
    confidence.numeric > 3
  )
excellent
## # A tibble: 520,135 x 6
##           artist                   title mean.price downloads confidence
##            <chr>                   <chr>      <dbl>     <int>      <chr>
##  1          0131                Il Danno       1.19      6646  very good
##  2           091        Un Día De Lluvia       0.69      5219  excellent
##  3 10000 Maniacs         A Campfire Song       0.89      6548  excellent
##  4 10000 Maniacs       Across The Fields       1.09      7310  excellent
##  5 10000 Maniacs  All That Never Happens       0.79      6014  excellent
##  6 10000 Maniacs     Among The Americans       1.09      7325  excellent
##  7 10000 Maniacs Anthem For Doomed Youth       0.59      5109  very good
##  8 10000 Maniacs               Arbor Day       1.09      7298  excellent
##  9 10000 Maniacs   A Room For Everything       0.79      6027  excellent
## 10 10000 Maniacs        Back O' The Moon       1.29      8165  excellent
## # ... with 520,125 more rows, and 1 more variables:
## #   confidence.numeric <dbl>

Truncating

# First five rows; tail() would return the last rows instead.
excellent %>% head(n = 5)
## # A tibble: 5 x 6
##          artist                  title mean.price downloads confidence
##           <chr>                  <chr>      <dbl>     <int>      <chr>
## 1          0131               Il Danno       1.19      6646  very good
## 2           091       Un Día De Lluvia       0.69      5219  excellent
## 3 10000 Maniacs        A Campfire Song       0.89      6548  excellent
## 4 10000 Maniacs      Across The Fields       1.09      7310  excellent
## 5 10000 Maniacs All That Never Happens       0.79      6014  excellent
## # ... with 1 more variables: confidence.numeric <dbl>
# Five rows with the highest mean price (sorted in descending order first).
excellent %>%
  arrange(desc(mean.price)) %>%
  head(n = 5)
## # A tibble: 5 x 6
##                        artist                             title mean.price
##                         <chr>                             <chr>      <dbl>
## 1                Patrick Andy                          You Step       2.39
## 2             Poster Children                    If You See Kay       2.39
## 3 Smokey RobinsonThe Miracles            I'll Try Something New       2.39
## 4        Sons Of The Pioneers The Everlasting Hills Of Oklahoma       2.39
## 5                         Us3                             India       2.39
## # ... with 3 more variables: downloads <int>, confidence <chr>,
## #   confidence.numeric <dbl>
# Random sample of a fixed number of rows (no seed is set, so it varies per run).
excellent %>% sample_n(size = 10)
## # A tibble: 10 x 6
##                               artist               title mean.price
##                                <chr>               <chr>      <dbl>
##  1                        David Ford          Go To Hell       0.79
##  2                         Sohodolls            Stripper       0.89
##  3                       Eric Burdon           The Dream       0.79
##  4                       Kevin Toney          Satin Doll       0.79
##  5                      Hope Partlow Everywhere But Here       0.89
##  6                           Sham 69      Hurry Up Harry       0.89
##  7                 Master Margherita               Sogno       1.09
##  8                      Dina Carroll             Falling       0.99
##  9                     Eden's Bridge        Here Is Love       0.99
## 10 Christian McBride Inside Straight         Uncle James       0.89
## # ... with 3 more variables: downloads <int>, confidence <chr>,
## #   confidence.numeric <dbl>
# Random sample of a fixed fraction of the rows (no seed is set here either).
excellent %>% sample_frac(size = 1e-05)
## # A tibble: 5 x 6
##             artist               title mean.price downloads confidence
##              <chr>               <chr>      <dbl>     <int>      <chr>
## 1   Whiskey Rebels          Summertime       1.89      9819  excellent
## 2       Janez Detd          Killing Me       1.19      8396  excellent
## 3 Inspiral Carpets           Lovegrove       0.79      5540  excellent
## 4          Diorama    A DIFFERENT LIFE       0.89      6073  very good
## 5     Max Stalling Scars and Souvenirs       0.99      6017  excellent
## # ... with 1 more variables: confidence.numeric <dbl>

Joins

# One row per distinct artist, each paired with a draw from a normal
# distribution (mean 100, sd 50). n() is the number of rows, so exactly one
# value is generated per artist.
random_data <- excellent %>%
  distinct(artist) %>%
  mutate(random_value = rnorm(n(), mean = 100, sd = 50))
random_data
## # A tibble: 49,422 x 2
##            artist random_value
##             <chr>        <dbl>
##  1           0131     47.57599
##  2            091    138.82757
##  3  10000 Maniacs    152.29532
##  4  1000 Homo DJs    122.23192
##  5 1000 Homo DJ's    116.61590
##  6  1000 Mexicans     98.71083
##  7      1000names     76.09348
##  8     1000 Names    125.26632
##  9    1000 Robota    150.68128
## 10           1001     58.73839
## # ... with 49,412 more rows
# Histogram of the per-artist random values.
ggplot(data = random_data, mapping = aes(x = random_value)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Attach each artist's random value to the track rows, matching on `artist`.
# An inner join keeps only rows that find a match in random_data ...
joined <- excellent %>% inner_join(random_data, by = "artist")
# ... while a left join keeps every row of `excellent`. This second assignment
# replaces the result of the first.
joined <- excellent %>% left_join(random_data, by = "artist")

joined
## # A tibble: 520,135 x 7
##           artist                   title mean.price downloads confidence
##            <chr>                   <chr>      <dbl>     <int>      <chr>
##  1          0131                Il Danno       1.19      6646  very good
##  2           091        Un Día De Lluvia       0.69      5219  excellent
##  3 10000 Maniacs         A Campfire Song       0.89      6548  excellent
##  4 10000 Maniacs       Across The Fields       1.09      7310  excellent
##  5 10000 Maniacs  All That Never Happens       0.79      6014  excellent
##  6 10000 Maniacs     Among The Americans       1.09      7325  excellent
##  7 10000 Maniacs Anthem For Doomed Youth       0.59      5109  very good
##  8 10000 Maniacs               Arbor Day       1.09      7298  excellent
##  9 10000 Maniacs   A Room For Everything       0.79      6027  excellent
## 10 10000 Maniacs        Back O' The Moon       1.29      8165  excellent
## # ... with 520,125 more rows, and 2 more variables:
## #   confidence.numeric <dbl>, random_value <dbl>
# Histogram of the random values after the join (one value per track row).
ggplot(data = joined, mapping = aes(x = random_value)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Aggregation

# Number of rows per artist; the result stays grouped by `artist`.
c <- count(group_by(joined, artist))

c
## # A tibble: 49,422 x 2
## # Groups:   artist [49,422]
##            artist     n
##             <chr> <int>
##  1           0131     1
##  2            091     1
##  3  10000 Maniacs    61
##  4  1000 Homo DJs     3
##  5 1000 Homo DJ's     1
##  6  1000 Mexicans     8
##  7      1000names    36
##  8     1000 Names     1
##  9    1000 Robota     5
## 10           1001     1
## # ... with 49,412 more rows
# Distribution of the per-artist row counts.
ggplot(data = c, mapping = aes(x = n)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Per-artist summary: price and download statistics, the average numeric
# confidence, and a list-column holding every confidence label of the artist.
joined %>%
  group_by(artist) %>%
  summarise(
    median.price    = median(mean.price),
    total.downloads = sum(downloads),
    mean.downloads  = mean(downloads),
    mean.confidence = mean(confidence.numeric),
    confidence_list = list(confidence)
  )
## # A tibble: 49,422 x 6
##            artist median.price total.downloads mean.downloads
##             <chr>        <dbl>           <int>          <dbl>
##  1           0131         1.19            6646       6646.000
##  2            091         0.69            5219       5219.000
##  3  10000 Maniacs         0.89          412593       6763.820
##  4  1000 Homo DJs         1.09           21469       7156.333
##  5 1000 Homo DJ's         0.69            5065       5065.000
##  6  1000 Mexicans         0.94           43736       5467.000
##  7      1000names         0.89          209537       5820.472
##  8     1000 Names         1.39            7808       7808.000
##  9    1000 Robota         0.99           33338       6667.600
## 10           1001         0.99            7092       7092.000
## # ... with 49,412 more rows, and 2 more variables: mean.confidence <dbl>,
## #   confidence_list <list>

SparkR

# SparkR is attached after dplyr, so verbs exported by both packages
# (select, filter, group_by, summarize, collect, ...) now resolve to the
# SparkR versions.
library(SparkR)
sparkR.session()
# Take a small local sample and convert it to a plain data.frame before
# handing it to Spark.
sample <- downloads %>% sample_n(1000) %>% as.data.frame
df <- createDataFrame(sample, schema=NULL)
df %>% head(5) %>% as_tibble
# SparkR's select() expects column names as strings (or Column objects such
# as df$artist); a bare symbol is looked up as an R variable and is not found.
df %>% select("artist") %>% head(5)
df %>% filter(df$artist == "Madonna") %>% head(5)
# Aggregate with mean(), which SparkR provides for Columns and which matches
# the `mean_price` result name; median() is not applicable to a Column here.
# NOTE(review): assumes createDataFrame() exposes `mean.price` as `mean_price`
# (dots replaced by underscores) -- confirm against the SparkR version in use.
df %>% group_by(df$artist) %>% summarize(mean_price = mean(df$mean_price)) %>% collect