# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
# Silence library chatter (incl. pandas SettingWithCopyWarning) in notebook output.
warnings.filterwarnings('ignore')
# Load the tab-separated Hosted Games dataset and preview the first rows.
data = pd.read_csv('data.csv', sep='\t')
data.head()
import datetime
# Parse the release dates into proper datetimes, then count stories per release year.
data['Release Date'] = pd.to_datetime(data['Release Date'])
date_group = data.groupby([data['Release Date'].dt.year]).count()
date_group
# Line plot of story counts per release year.
years = date_group['Release Date'].index
counts = date_group['HG Story Title']
plt.plot(years, counts, marker='x')
plt.title('Stories released per year')
plt.xlabel('Release Year')
plt.ylabel('Count')
plt.grid(True)
plt.xticks(years, rotation=45)
plt.show()
Overall, the trend is increasing until a peak in 2018. Data for 2021 is incomplete.
# Count stories per genre (title column is the per-group row counter).
genre_group = data.groupby('Genre')[['HG Story Title']].count()
genre_group
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()
# Pie chart of story counts per genre.
pie_trace = go.Pie(labels=genre_group.index, values=genre_group['HG Story Title'])
fig = go.Figure(data=[pie_trace])
iplot(fig)
# Horizontal bar chart of story counts per genre.
plt.figure(figsize=(10, 8))
plt.barh(genre_group.index, genre_group['HG Story Title'])
plt.title('Stories released per genre')
plt.ylabel('Genre')
plt.xlabel('Count')
plt.grid(True, axis='x')
plt.show()
Fantasy is the most common genre, followed by Supernatural and Sci-Fi.
# Composite popularity metric: total ratings across both platforms.
# NOTE(review): games missing Google review data get NaN Popularity here,
# since NaN propagates through the sum; they are filtered out further below.
data['Popularity'] = data['# of Omnibus Ratings'] + data['# of Google Reviews']
px.scatter(data, x='# of Omnibus Ratings', y='# of Google Reviews', hover_data=['HG Story Title'], trendline='ols')
This doesn't look so good but let's just calculate a correlation...
import statsmodels.api as sm
import statsmodels.formula.api as smf
It turns out that some of the games don't have Google review data. Which are those games?
# Show the games with no Google review data (isnull is an exact alias of isna).
data[data['# of Google Reviews'].isnull()]
Alright, let's just remove those games from the data.
# Drop games lacking Google review data, then regress the number of Omnibus
# ratings on the number of Google reviews (with an intercept).
data_clean = data[data['# of Google Reviews'].notna()]
ols_model = sm.OLS(data_clean['# of Omnibus Ratings'],
                   sm.add_constant(data_clean['# of Google Reviews']))
results = ols_model.fit()
print(results.summary())
That's a lot of numbers! Here I am attempting to discern the relationship between the number of Omnibus ratings and the number of Google reviews.
An important number here is R-squared. Basically, this shows the strength of the relationship between the two variables. It's 0.496, which is pretty decent. So there definitely is a relationship between the two popularity metrics. But I'm guessing the outlier kind of skews the results... What is the outlier, anyway? That seems to be by far the most popular game published by Hosted Games.
# Rank all games from most to least popular and show the top ten.
data_popular = data.sort_values(by='Popularity', ascending=False)
data_popular.head(10)
The outlier that we saw earlier was The Great Tournament, which is the most popular Hosted Game.
By most metrics, Philip Kempton's The Great Tournament and Life of a Mercenary are the two most popular Hosted Games. Congrats I guess.
Okay, now let's plot a histogram of popularity:
# Distribution of the composite popularity metric.
px.histogram(data, x='Popularity', nbins=50)
# Regress Popularity on Word Count (with an intercept), then visualize the fit.
wc_design = sm.add_constant(data_clean['Word Count'])
results = sm.OLS(data_clean['Popularity'], wc_design).fit()
print(results.summary())
px.scatter(data_clean, x='Word Count', y='Popularity', hover_data=['HG Story Title'], trendline='ols')
There's definitely a positive relationship between word count and popularity, but the R^2 between word count and popularity is about 0.08, which is... very weak.
What happens if we remove The Great Tournament?
# Remove the single massive outlier and refit Popularity ~ Word Count.
is_tgt = data_clean['HG Story Title'] == 'The Great Tournament'
data_no_outliers = data_clean[~is_tgt]
results = sm.OLS(data_no_outliers['Popularity'],
                 sm.add_constant(data_no_outliers['Word Count'])).fit()
print(results.summary())
Now, the R^2 improves to 0.193, which is... better?
The regression equation is $Popularity = 0.066*WordCount + 471.8$
This means that, for example, the expected popularity of a 500k-word story would be about 3800.
# Word count vs. popularity without the outlier: linear axes, then log-log axes.
px.scatter(data_no_outliers, x='Word Count', y='Popularity', hover_data=['HG Story Title'], trendline='ols')
px.scatter(data_no_outliers, x='Word Count', y='Popularity', hover_data=['HG Story Title'], trendline='ols', log_x=True, log_y=True)
We might expect earlier released games to be more popular, just because they've had more time to acquire readers. But we might also expect later released games to be more popular, because of greater visibility for HG in more recent years, and the more recent release of the omnibus app. Or maybe there is no relation at all. Which hypothesis is true?
# Quick matplotlib look at popularity vs. release date.
plt.scatter(data['Release Date'], data['Popularity'])
# okay that wasn't very helpful... but what is the outlier? (It's The Great Tournament)
px.scatter(data, x='Release Date', y='Popularity', hover_data=['HG Story Title'], trendline='ols')
# what about within the omnibus only? (data_no_outliers already excludes The Great Tournament)
px.scatter(data_no_outliers, x='Release Date', y='# of Omnibus Ratings', hover_data=['HG Story Title'], trendline='ols')
# what about within GPS only? (again on data_no_outliers, i.e. The Great Tournament removed)
px.scatter(data_no_outliers, x='Release Date', y='# of Google Reviews', hover_data=['HG Story Title'], trendline='ols')
It doesn't look like there's much of a relationship between popularity and release date.
Let's plot the average popularity per genre:
# Mean popularity per genre, as a bar chart.
genre_pop_group = data_clean[['Genre', 'Popularity']].groupby('Genre').mean()
px.bar(genre_pop_group, x=genre_pop_group.index, y='Popularity')
# Box plot of popularity by genre (quartiles + whiskers).
px.box(data_clean, x='Genre', y='Popularity', hover_data=['HG Story Title'])
# Same box plot on a log popularity scale, with every individual game shown;
# also saved to an HTML file for sharing.
games_plot = px.box(data_clean, x='Genre', y='Popularity', hover_data=['HG Story Title'], log_y=True, points='all')
games_plot.show()
games_plot.write_html('./log_popularity_by_genre.html')
px.strip(data_clean, x='Genre', y='Popularity', hover_data=['HG Story Title'], log_y=True)
# Box plot by genre with The Great Tournament removed.
px.box(data_no_outliers, x='Genre', y='Popularity', hover_data=['HG Story Title'])
Supernatural, Fantasy, and Superhero are the most popular genres on average. They are also the most popular genres in total, so writers' interests and readers' interests seem to overlap for the most part. Interestingly, it seems that Superhero has the highest floor of popularity: the least popular Superhero game has a popularity of 419, which is the highest minimum popularity of any genre!
We'll introduce dummy variables for all of the genres, as well as "Free?". Let's also add Release Year and author_count as predictors.
# 1. Create categorical (one-hot) variables to represent genre inclusions
genre_cols = []
genre_cols_quoted = []  # quoted column names, kept for formula-style APIs
for genre in set(data['Genre']):
    # 1 if the story's primary genre matches, else 0
    data['is_' + genre] = (data['Genre'] == genre).astype(int)
    genre_cols_quoted.append('"is_' + genre + '"')
    genre_cols.append('is_' + genre)
# 2. identify author release count: number of OTHER stories by the same author
# (value_counts/map replaces the groupby->dict->list round trip; assumes every
# row has a non-null 'HG Story Title', which the original count relied on too)
author_counts = data['Author'].value_counts()
data['author_count'] = data['Author'].map(author_counts) - 1
# Binary indicators: free-to-play flag and post-2018 release era
data['is_free'] = (data['Free?'] == 'Yup').astype(int)
data['year'] = data['Release Date'].dt.year
data['post_2018'] = (data['Release Date'].dt.year >= 2018).astype(int)
# Rebuild the cleaned frames so they pick up the new columns
data_clean = data[~data['# of Google Reviews'].isna()]
data_clean_X = data_clean[['Word Count', 'is_free', 'author_count', 'post_2018'] + genre_cols]
data_no_outliers = data_clean[data_clean['HG Story Title'] != 'The Great Tournament']
data_noo_X = data_no_outliers[['Word Count', 'is_free', 'author_count', 'post_2018'] + genre_cols]
# OLS: Popularity ~ word count + free flag + author experience + era + genre dummies
results = sm.OLS(data_clean['Popularity'], sm.add_constant(data_clean_X)).fit()
print(results.summary())
Now, what the heck does this mean?
If we interpret the linear regression coeffients very literally...
THESE DESCRIPTIONS ARE OUTDATED AS THEY DO NOT INCLUDE THE POST_2018 OR AUTHOR_COUNT TERMS
None of the other conclusions are significant. Based on this analysis, we cannot conclusively say that, for example, puzzle or adventure or horror games will have a penalty to popularity, simply because there are too few games in those categories. Similarly, we can't say that school or post-apocalyptic games will have a bonus to popularity.
Now, what if we remove The Great Tournament?
# Refit the multivariate popularity model with The Great Tournament excluded.
results = sm.OLS(data_no_outliers['Popularity'], sm.add_constant(data_noo_X)).fit()
print(results.summary())
These results actually look quite a bit better. Here's a summary:
THESE DESCRIPTIONS ARE OUTDATED AS THEY DO NOT INCLUDE THE POST_2018 OR AUTHOR_COUNT TERMS
These are all the "significant" conclusions here. According to the regression coefficients, puzzle, adventure, sci-fi, horror, historical, and steampunk all have penalties, while war, school, and post-apocalyptic all have slight bonuses. But none of those are "significant", because the sample size is very, very small.
We can look at the residuals, which are the true value minus the predicted value. A positive residual indicates that a game was more popular than predicted, while a negative residual indicates that a game was less popular than predicted.
# Predicted vs. actual popularity, and the residuals, for the no-outlier model.
predictions = results.predict(sm.add_constant(data_noo_X))
px.scatter(data_no_outliers, x='Popularity', y=predictions, hover_data=['HG Story Title'], labels={'y': 'Predicted popularity'})
px.scatter(data_no_outliers, x='Popularity', y=results.resid, hover_data=['HG Story Title'], labels={'y': 'Residual'})
# NOTE(review): data_no_outliers is a slice of data_clean, so this column
# assignment can trigger SettingWithCopyWarning (suppressed at the top of the
# file); consider creating data_no_outliers with .copy().
data_no_outliers['resid'] = results.resid
# Highest residuals first: games more popular than the model predicts.
more_popular = data_no_outliers.sort_values('resid', ascending=False)
more_popular.head(10)
The games that are more popular than expected are, in order (not including The Great Tournament): Life of a Mercenary, Wayhaven 1, Wayhaven 2, Hero or Villain, Doomsday on Demand, Samurai of Hyuga, The Parenting Simulator, War for the West, The Aether, and Evertree Inn.
# Lowest residuals first: games less popular than the model predicts.
less_popular = data_no_outliers.sort_values(by='resid')
less_popular.head(10)
The games that are less popular than expected are, in order: Tin Star, Magikiras, Gambling with Eternity, Elemental Saga, Burn(t), Seven Bullets, Twin Flames, Lost in the Pages, The Aegis Saga, and Best of Us.
What if we added data for secondary genres as well as primary genres?
# 1. Create categorical variables for genre inclusion, this time counting a
#    match in EITHER the primary genre or the secondary genre column.
data_subgenres = data.copy()
genre_cols = []
genre_cols_quoted = []
for genre in set(data['Genre']):
    in_genre = (data['Genre'] == genre) | (data['Subgenre (if applicable)'] == genre)
    data_subgenres['is_' + genre] = in_genre.astype(int)
    genre_cols_quoted.append('"is_' + genre + '"')
    genre_cols.append('is_' + genre)
data_sg_clean = data_subgenres[~data_subgenres['# of Google Reviews'].isna()]
data_sg_clean_X = data_sg_clean[['Word Count', 'is_free'] + genre_cols]
# BUG FIX: the outlier mask was previously built from data_clean; use
# data_sg_clean's own title column so the mask is guaranteed to align with the
# frame it filters (the original only worked because the indexes happened to match).
data_sg_no_outliers = data_sg_clean[data_sg_clean['HG Story Title'] != 'The Great Tournament']
data_sg_noo_X = data_sg_no_outliers[['Word Count', 'is_free', 'author_count', 'post_2018'] + genre_cols]
# Refit the popularity model using the subgenre-aware dummies.
results = sm.OLS(data_sg_no_outliers['Popularity'], sm.add_constant(data_sg_noo_X)).fit()
print(results.summary())
Well, the $R^2$ is a bit higher here, at 0.363 vs 0.351 for the primary genre-only model. So that's good, indicating that the sub-genre provides some additional information. The coefficients for Word Count and is_free are pretty similar as before.
The interesting thing is that it totally changes the directions of some of the coefficients for the genres:
In order of highest to lowest predicted popularity gain, the genres are:
and these genres are predicted to cause popularity loss:
(* means that the effect of that genre is statistically significant at p<0.05.)
What if we tried to do a regression on log(Word Count) instead?
import numpy as np
# Log-log model: regress log2(Popularity) on log2(Word Count) plus the dummies.
# NOTE(review): column assignment on a slice — may warn; see earlier note.
data_no_outliers['log_WC'] = np.log2(data_no_outliers['Word Count'])
data_noo_X = data_no_outliers[['log_WC', 'is_free', 'post_2018', 'author_count'] + genre_cols]
results_loglog = sm.OLS(np.log2(data_no_outliers['Popularity']), sm.add_constant(data_noo_X)).fit()
print(results_loglog.summary())
predictions = results_loglog.predict(sm.add_constant(data_noo_X))
# Back-transform the predictions from log2 space to raw popularity.
predictions = 2**predictions
px.scatter(data_no_outliers, x='Popularity', y=predictions, hover_data=['HG Story Title'], labels={'y': 'Predictions'})
Is the log regression better than the linear regression? Hard to say. The errors are just as large, but at least there aren't any negative predictions.
Let's just try plotting the Google ratings vs the omnibus ratings.
# Omnibus rating vs. Google Play Store score, with an OLS trendline.
px.scatter(data_clean, x='GPS Score', y='Omnibus Rating', trendline='ols',
           hover_data=['HG Story Title'], title='Omnibus vs Google ratings')
Overall, the Omnibus rating and GPS rating correlate very well, with an $R^2$ of 0.7 in the above line.
The regression equation is $OmnibusRating = 0.477*GPS + 2.565$.
Let's plot GPS Score vs GPS ratings count, and Omnibus Rating vs Omnibus rating count.
# Rating vs. number of reviews on each platform, with LOWESS smoothing.
px.scatter(data_clean, x='# of Google Reviews', y='GPS Score', trendline='lowess', hover_data=['HG Story Title'])
px.scatter(data_clean, x='# of Omnibus Ratings', y='Omnibus Rating', trendline='lowess', hover_data=['HG Story Title'])
From this, it's pretty clear that more popular games tend to have higher ratings on both platforms. Not going to try to do any regressions here.
# Distributions and means of the ratings on each platform.
px.histogram(data_clean, x='Omnibus Rating')
data_clean['Omnibus Rating'].mean()
px.histogram(data_clean, x='GPS Score')
data_clean['GPS Score'].mean()
Okay, we're going to do some more regressions. There's probably a better way to do this than OLS because ratings are in a pretty narrow range.
Let's consider the data without sub-genres first:
# Rating model, primary genres only: Omnibus Rating ~ word count + free flag + genre dummies.
data_noo_X = data_no_outliers[['Word Count', 'is_free'] + genre_cols]
design = sm.add_constant(data_noo_X)
results = sm.OLS(data_no_outliers['Omnibus Rating'], design).fit()
print(results.summary())
Interpretation:
Genres sorted by rating bonuses:
Adventure Spy Fantasy Superhero War Supernatural Slice of Life Post-Apocalyptic Crime Horror Sci-Fi* Historical Romance Mystery Steampunk
And here are the predicted "negative" genres: Puzzle School Humor
Again, a * indicates statistical significance at p<0.05.
Interestingly, Adventure and Spy were some of the worst performing genres in terms of popularity. It's interesting that they seem to give ratings bonuses in the omnibus (but these are very small sample sizes).
Now, let's use the secondary genres:
# Rating model using the subgenre-aware dummies instead.
data_sg_noo_X = data_sg_no_outliers[['Word Count', 'is_free'] + genre_cols]
sg_design = sm.add_constant(data_sg_noo_X)
results = sm.OLS(data_sg_no_outliers['Omnibus Rating'], sg_design).fit()
print(results.summary())
The R^2 here is slightly worse than only using the primary genre.
The trends with regard to genre are... a lot more unclear vs only using the primary genre. None of the results are significant at p<0.05.
# Box plot of Omnibus ratings per genre, with every game shown; saved to HTML.
genre_ratings_plot = px.box(data_clean, x='Genre', y='Omnibus Rating', hover_data=['HG Story Title'], points='all')
genre_ratings_plot.show()
genre_ratings_plot.write_html('genre_ratings_plot.html')
How are we defining "underrated" here? Let's just say, the games whose ratings are higher than expected given their popularity. I think we should do log-popularity because it looks kinda more linear.
Also, given low rating counts, we should be using Bayesian averaging... but I don't think we have enough information to actually calculate a Bayesian average.
# compute bayesian average ratings: shrink each game's rating toward a prior,
# weighted as though the prior were backed by C pseudo-ratings.
# NOTE(review): the original comment claimed the prior was "arbitrarily set to 4",
# but the code actually uses the overall mean rating as the prior.
C = 20  # prior weight: equivalent number of pseudo-ratings
m = data_clean['Omnibus Rating'].mean()  # prior = overall mean Omnibus rating
ba_omnibus_ratings = (C*m + data_clean['Omnibus Rating']*data_clean['# of Omnibus Ratings'])/(C + data_clean['# of Omnibus Ratings'])
C = 20
m = data_clean['GPS Score'].mean()  # prior = overall mean GPS score
ba_google_ratings = (C*m + data_clean['GPS Score']*data_clean['# of Google Reviews'])/(C + data_clean['# of Google Reviews'])
data_clean['ba_omnibus_rating'] = ba_omnibus_ratings
data_clean['ba_google_rating'] = ba_google_ratings
# Rebuild the outlier-free frame so it carries the new Bayesian-average columns.
data_no_outliers = data_clean[data_clean['HG Story Title'] != 'The Great Tournament']
# Bayesian-average rating vs. rating count (log x axis) on each platform.
px.scatter(data_clean, x='# of Omnibus Ratings', y='ba_omnibus_rating', trendline='lowess', hover_data=['HG Story Title'], log_x=True)
px.scatter(data_clean, x='# of Google Reviews', y='ba_google_rating', trendline='lowess', hover_data=['HG Story Title'], log_x=True)
Let's do a linear regression of the omnibus and GPS ratings as a function of log(review count).
# Regress each platform's Bayesian-average rating on log2(review count).
omnibus_results = sm.OLS(data_no_outliers['ba_omnibus_rating'], sm.add_constant(np.log2(data_no_outliers['# of Omnibus Ratings']))).fit()
print(omnibus_results.summary())
gps_results = sm.OLS(data_no_outliers['ba_google_rating'], sm.add_constant(np.log2(data_no_outliers['# of Google Reviews']))).fit()
print(gps_results.summary())
# Store per-game predictions and residuals on the frame.
# NOTE(review): these column assignments are on a slice of data_clean and may
# warn (suppressed at the top of the file); consider .copy() at creation.
omnibus_predictions = omnibus_results.predict(sm.add_constant(np.log2(data_no_outliers['# of Omnibus Ratings'])))
data_no_outliers['omnibus_rating_predictions'] = omnibus_predictions
data_no_outliers['omnibus_resid'] = omnibus_results.resid
gps_predictions = gps_results.predict(sm.add_constant(np.log2(data_no_outliers['# of Google Reviews'])))
data_no_outliers['gps_rating_predictions'] = gps_predictions
data_no_outliers['gps_resid'] = gps_results.resid
# Most "underrated" games: rating higher than expected given review count.
data_no_outliers.sort_values('omnibus_resid', ascending=False).head(10)
data_no_outliers.sort_values('gps_resid', ascending=False).head(10)