R Programming Language Tutorial Videos on YouTube

See R Programming Language Tutorial Videos by Hendy Irawan.

Installing Packages: libcurl4-openssl-dev, TwitteR, httpuv, tm, wordcloud, RColorBrewer

Install Ubuntu package libcurl4-openssl-dev required by RCurl R package:

sudo aptitude install libcurl4-openssl-dev

Install the R packages:

install.packages(c('TwitteR', 'httpuv', 'tm', 'wordcloud'))

Install ggplot2

install.packages('ggplot2')

Load Data

Via Twitter API:

tl_dakwatuna <- userTimeline('dakwatuna', n=1000, includeRts = TRUE)
tl_dakwatuna.df <- twListToDF(tl_dakwatuna)
tl_dakwatuna.df$text <- iconv(tl_dakwatuna.df$text, to='utf-8')
write.csv(tl_dakwatuna.df, '~/git/r-tutorials/tl_dakwatuna_2015-04-03.csv')

or offline from dataset:

tl_dakwatuna.df <- read.csv('tl_dakwatuna_2015-04-03_tagged.csv', row.names=1)

Retweets vs Favorites

Histogram:

library(ggplot2)

ggplot(tl_dakwatuna.df, aes(x=retweetCount)) + geom_histogram(binwidth=0.1, fill='green') + geom_smooth(stat='bin', binwidth=0.1, colour='darkgreen') + geom_histogram(aes(x=favoriteCount), fill='blue', binwidth=0.1, alpha=0.5) + scale_x_log10() + ggtitle('@dakwatuna engagement')

Scatterplot:

library(ggplot2)

ggplot(tl_dakwatuna.df, aes(x=retweetCount, y=favoriteCount)) + geom_jitter() + scale_x_log10() + scale_y_log10() + ggtitle('@dakwatuna retweets vs favorites')

Tweets, Retweets, Favorites, and their ratios by Hour-of-Day

Make the hourOfDay column as factors (if necessary):

df <- tl_dakwatuna.df
#df$hourOfDay = factor(df$hourOfDay)
summary(df$hourOfDay)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   10.00   14.00   13.86   18.00   23.00

Make a data frame having the counts per group:

library(plyr)
library(ggplot2)

byHourOfDay <-ddply(df, ~hourOfDay, summarise,
                    tweetCount = length(text),
                    retweetSum = sum(retweetCount), retweetMean = mean(retweetCount), retweetSd = sd(retweetCount), retweetMax = max(retweetCount),
                    favoriteSum = sum(favoriteCount), favoriteMean = mean(favoriteCount), favoriteSd = sd(favoriteCount), favoriteMax = max(favoriteCount),
                    favoriteRatioSum = sum(as.numeric(favoriteRatio)))
#View(byHourOfDay)

engagementScale <- 10
ggplot(byHourOfDay, aes(x=hourOfDay, y=tweetCount, fill=hourOfDay)) + 
  ylim(0, max(byHourOfDay$tweetCount)) +
  geom_bar(stat='identity') + 
  geom_pointrange(aes(y=engagementScale*retweetMean, ymin=engagementScale*pmax(retweetMean-retweetSd, 0), ymax=engagementScale*(retweetMean+retweetSd), size=favoriteRatioSum), color='green') +
  geom_smooth(method='loess', aes(y=engagementScale*retweetMean), fill='green', color='green') +
  geom_pointrange(aes(y=engagementScale*favoriteMean, ymin=engagementScale*pmax(favoriteMean-favoriteSd, 0), ymax=engagementScale*(favoriteMean+favoriteSd)), size=1, stat='identity', color='orange') +
  geom_smooth(method='loess', aes(y=engagementScale*favoriteMean), fill='orange', color='orange') +
  scale_x_continuous(breaks=byHourOfDay$hourOfDay) +
  ggtitle('@dakwatuna tweets by hour-of-day')
## Warning: Removed 1 rows containing missing values (geom_segment).

Tweets, Retweets, Favorites, and their ratios by Day-of-Week

library(plyr)
library(ggplot2)

byDayOfWeek <-ddply(df, ~dayOfWeek, summarise,
                    tweetCount = length(text),
                    retweetSum = sum(retweetCount), retweetMean = mean(retweetCount), retweetSd = sd(retweetCount), retweetMax = max(retweetCount),
                    favoriteSum = sum(favoriteCount), favoriteMean = mean(favoriteCount), favoriteSd = sd(favoriteCount), favoriteMax = max(favoriteCount),
                    favoriteRatioSum = sum(as.numeric(favoriteRatio)))
#View(byDayOfWeek)

dayOfWeeks <- c('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')
twitterScreenName <- 'dakwatuna'
engagementScale <- 10
ggplot(byDayOfWeek, aes(x=dayOfWeek, y=tweetCount, fill=dayOfWeek)) + 
  ylim(0, max(byDayOfWeek$tweetCount)) +
  geom_bar(stat='identity') + 
  geom_pointrange(aes(y=engagementScale*retweetMean, ymin=engagementScale*pmax(retweetMean-retweetSd, 0), ymax=engagementScale*(retweetMean+retweetSd), size=favoriteRatioSum), color='green') +
  geom_pointrange(aes(y=engagementScale*favoriteMean, ymin=engagementScale*pmax(favoriteMean-favoriteSd, 0), ymax=engagementScale*(favoriteMean+favoriteSd)), size=1, stat='identity', color='orange') +
  scale_x_discrete(limits=dayOfWeeks) +
  ggtitle(paste('@', twitterScreenName, ' tweets by day-of-week', sep=''))
## Warning: Removed 1 rows containing missing values (geom_segment).