import pandas
from holoviews import *
from holoviews.interface.seaborn import DFrame
import numpy as np
%load_ext holoviews.ipython
marking_df = pandas.read_csv('./Student_Marks.csv')[0:84]
marking_df['Total'] = [float(t.split('%')[0]) for t in marking_df['Total']]
marking_df['Index'] = range(1,len(marking_df)+1)
%output size=200
First, we'll check whether the marking process suffered from considerable bias over time; this is possible to test because the scores are indexed consecutively in the order in which I marked them.
%%opts Regression.By_Index [apply_databounds=True aspect=0.6] Regression.By_Group [apply_databounds=True aspect=0.05]
# Two regressions of 'Total', joined with `+`: one against marking order
# ('Index') and one against 'Tutorial Group'. The `group` names match the
# Regression.By_Index / Regression.By_Group selectors in the %%opts line.
# NOTE(review): extents presumably give (x-min, y-min, x-max, y-max); the 84
# matches the number of rows loaded into marking_df and the 8 is presumably
# the number of tutorial groups -- confirm.
DFrame(marking_df).regression('Index', 'Total', group='By_Index', reduce_fn=np.mean, extents=(1,0,84,100)) +\
DFrame(marking_df).regression('Tutorial Group', 'Total', group='By_Group', reduce_fn=np.mean, extents=(1,0,8,100))
As we can see, the data does not show any trend that isn't explained by the variability in performance between the Tutorial Groups.
%%output dpi=120
%%opts Regression [apply_databounds=True]
# Regression of 'Total' on 'Index' with 'Tutorial Group' passed as an extra
# dimension (mdims), then laid out over 'Tutorial Group'.
# NOTE(review): presumably this yields one panel per tutorial group, and a
# None entry in extents leaves that bound unset -- confirm.
DFrame(marking_df).regression('Index', 'Total', mdims=['Tutorial Group'], extents=(None, 0, None, 100)).layout(['Tutorial Group'])
We can break this down by Tutorial Group to check whether there are any trends within each group that we may have missed. Trends that carry through multiple groups would be indicative of bias; this is not the case.
%%opts Distribution (bins=20 hist_kws={'range':(0,100)} kde_kws={'cut':0} rug=True color='indianred') DFrame (cut=0)
# Distribution of all 'Total' scores, joined with `+` to a violin plot of
# 'Total' by 'Tutorial Group'. Styling for both comes from the %%opts line.
Distribution(marking_df.Total, key_dimensions=['Total Score']) +\
DFrame(marking_df, group='Total Score', plot_type='violinplot', x='Tutorial Group', y='Total')
Now the mean total score broken down by Tutorial Group.
%%opts DFrame (kind='bar' aspect=2)
# Factor plot of 'Total' against 'Tutorial Group', drawn as bars via the
# kind='bar' option above.
# NOTE(review): the accompanying text calls this the mean per group; no
# aggregation is specified here, so that presumably comes from the plotting
# backend's default -- confirm.
DFrame(marking_df, plot_type='factorplot', x='Tutorial Group', y='Total')
Finally a breakdown into degree classification:
# Count students per degree class. The bands are contiguous half-open
# intervals, so every total falls into exactly one class:
#   First >= 70, 2:1 [60, 70), 2:2 [50, 60), Third [40, 50), Fail < 40.
# The First band must be inclusive of 70; with a strict `>` a total of
# exactly 70 would be counted in no class at all.
totals = marking_df.Total
first = sum(totals >= 70)
twoone = sum((60 <= totals) & (totals < 70))
twotwo = sum((50 <= totals) & (totals < 60))
third = sum((40 <= totals) & (totals < 50))
fail = sum(totals < 40)
# Kept as a float so the percentages below use true division on Python 2.
nstudents = float(len(marking_df.Total))
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("Total with a First: {} ({} %)".format(first, 100*first/nstudents))
print("Total with a 2:1: {} ({} %)".format(twoone, 100*twoone/nstudents))
print("Total with a 2:2: {} ({} %)".format(twotwo, 100*twotwo/nstudents))
print("Total with a Third: {} ({} %)".format(third, 100*third/nstudents))
print("Total with a Fail: {} ({} %)".format(fail, 100*fail/nstudents))
And here are some basic stats:
# Summary statistics of the 'Total' column, as returned by describe().
marking_df.Total.describe()
# Reshape the answer columns (those whose name contains 'A1') from wide to
# long form: one row per (student, answer column) pair.
answers = [c for c in marking_df.columns if 'A1' in c]
stacked = []
for sn, group in marking_df.groupby('Student Number'):
    # stack() turns the selected columns into rows; reset_index(level=1)
    # moves the original column name out of the index into a regular column.
    group_df = pandas.DataFrame(group.filter(answers).stack()).reset_index(level=1)
    group_df.insert(0, 'Student Number', sn)
    # `group` already holds exactly this student's rows, so read the tutorial
    # group from it rather than re-filtering the whole of marking_df on
    # every iteration.
    tg = group['Tutorial Group'].iloc[0]
    group_df.insert(0, 'Tutorial Group', tg)
    stacked.append(group_df)
stacked_answers_df = pandas.concat(stacked)
stacked_answers_df.columns = ['Tutorial Group', 'Student Number', 'Answer', 'Result']
# The second character of the original column name is used as the answer
# number (assumes a single digit at that position -- TODO confirm against
# the CSV header).
stacked_answers_df['Answer'] = [int(q[1]) for q in stacked_answers_df['Answer']]
# Replace the repeated per-student row labels with a plain 0..n-1 index.
stacked_answers_df.index = list(range(len(stacked_answers_df)))
Let's get an overview of the distributions of answers.
%%output size=100 dpi=100
np.sum([DFrame(stacked_answers_df, plot_type='boxplot',
x='Answer', y='Result', label='Answer %d' % a).select(Answer=a)
for a in set(stacked_answers_df.Answer)])
The correct or acceptable answers were:
2. 0.0479
3. 0/0.1712/0.2166
4. 0
5. 3/1000
7. 0.1667
9. 0.01-0.2
# Reshape the per-question mark columns ('Question 1' .. 'Question 11') from
# wide to long form: one row per (student, question) pair.
questions = ['Question %d' % q for q in range(1, 12)]
stacked = []
for idx, (sn, group) in enumerate(marking_df.groupby('Student Number')):
    # stack() turns the selected columns into rows; reset_index(level=1)
    # moves the original column name out of the index into a regular column.
    group_df = pandas.DataFrame(group.filter(questions).stack()).reset_index(level=1)
    group_df.insert(0, 'Student Number', sn)
    # `group` already holds exactly this student's rows, so read the tutorial
    # group from it rather than re-filtering the whole of marking_df on
    # every iteration.
    tg = group['Tutorial Group'].iloc[0]
    group_df.insert(0, 'Tutorial Group', tg)
    # idx is the student's position in groupby iteration order; it is not
    # the row-order 'Index' column of marking_df.
    group_df.insert(0, 'Index', idx)
    stacked.append(group_df)
stacked_df = pandas.concat(stacked)
stacked_df.columns = ['Index', 'Tutorial Group', 'Student Number', 'Question', 'Mark']
# 'Question 7' -> 7, 'Question 11' -> 11: take the last two characters and
# strip the space left over for single-digit questions.
stacked_df['Question'] = [int(q[-2:].strip()) for q in stacked_df['Question']]
# Replace the repeated per-student row labels with a plain 0..n-1 index.
stacked_df.index = list(range(len(stacked_df)))
qdim = Dimension('Question', type=int)
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print("The overall mean score per question was: {}".format(stacked_df.Mark.mean()))
print("With a standard deviation of: {}".format(stacked_df.Mark.std()))
%%output size=300
%%opts DFrame (kind='point' aspect=2 palette='Set2')
# Point-style factor plot of 'Mark' against 'Question' from the long-form
# stacked_df, with 'Tutorial Group' passed as the second factor (x2).
# NOTE(review): presumably this draws one coloured series per tutorial
# group -- confirm.
DFrame(stacked_df, plot_type='factorplot', x='Question', y='Mark', x2='Tutorial Group')
And again, broken down individually:
%%output dpi=120
%%opts DFrame (kind='point' hue='Tutorial Group' col='Tutorial Group' palette='Set2' col_wrap=4)
# Factor plot of 'Mark' against 'Question'; the opts above set both hue and
# col to 'Tutorial Group' with col_wrap=4.
# NOTE(review): presumably one panel per tutorial group, four panels per
# row -- confirm.
DFrame(stacked_df, plot_type='factorplot', x='Question', y='Mark')
%%output size=400 dpi=80
%%opts Bars [aspect=25.0 xrotation=90]
# Bars of 'Mark' against the per-student 'Index' of stacked_df, with np.mean
# as the reduce function; qdim declares 'Question' as an integer dimension.
# NOTE(review): presumably np.mean averages each student's marks over the
# questions -- confirm.
DFrame(stacked_df, dimensions={'Question': qdim}).bars('Index', 'Mark', reduce_fn=np.mean)
%%output dpi=120
%%opts DFrame (kind='point' hue='Index' col='Index' palette='Set2' col_wrap=4)
# Factor plot of 'Mark' against 'Question'; the opts above set both hue and
# col to the per-student 'Index' of stacked_df with col_wrap=4.
# NOTE(review): presumably one panel per student, four panels per row --
# confirm.
DFrame(stacked_df, plot_type='factorplot', x='Question', y='Mark')