From 0bf85992da090b624fb915cb87a1323736b6a67c Mon Sep 17 00:00:00 2001
From: ichuang <ichuang@mit.edu>
Date: Sat, 8 Sep 2012 22:32:28 -0400
Subject: [PATCH] psychometrics djangoapp

---
 lms/djangoapps/psychometrics/__init__.py      |   0
 lms/djangoapps/psychometrics/admin.py         |   8 +
 .../psychometrics/management/__init__.py      |   0
 .../management/commands/__init__.py           |   0
 .../management/commands/init_psychometrics.py |  66 ++++
 lms/djangoapps/psychometrics/models.py        |  45 +++
 lms/djangoapps/psychometrics/psychoanalyze.py | 312 ++++++++++++++++++
 7 files changed, 431 insertions(+)
 create mode 100644 lms/djangoapps/psychometrics/__init__.py
 create mode 100644 lms/djangoapps/psychometrics/admin.py
 create mode 100644 lms/djangoapps/psychometrics/management/__init__.py
 create mode 100644 lms/djangoapps/psychometrics/management/commands/__init__.py
 create mode 100644 lms/djangoapps/psychometrics/management/commands/init_psychometrics.py
 create mode 100644 lms/djangoapps/psychometrics/models.py
 create mode 100644 lms/djangoapps/psychometrics/psychoanalyze.py

diff --git a/lms/djangoapps/psychometrics/__init__.py b/lms/djangoapps/psychometrics/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lms/djangoapps/psychometrics/admin.py b/lms/djangoapps/psychometrics/admin.py
new file mode 100644
index 0000000000..ff1a14d722
--- /dev/null
+++ b/lms/djangoapps/psychometrics/admin.py
@@ -0,0 +1,8 @@
+'''
+django admin registration for the psychometrics model (PsychometricData)
+'''
+
+from django.contrib import admin
+from psychometrics.models import PsychometricData	# explicit import: the only model registered below
+
+admin.site.register(PsychometricData)
diff --git a/lms/djangoapps/psychometrics/management/__init__.py b/lms/djangoapps/psychometrics/management/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lms/djangoapps/psychometrics/management/commands/__init__.py b/lms/djangoapps/psychometrics/management/commands/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lms/djangoapps/psychometrics/management/commands/init_psychometrics.py b/lms/djangoapps/psychometrics/management/commands/init_psychometrics.py
new file mode 100644
index 0000000000..b7c9779d08
--- /dev/null
+++ b/lms/djangoapps/psychometrics/management/commands/init_psychometrics.py
@@ -0,0 +1,66 @@
+#!/usr/bin/python
+#
+# generate psychometrics data from tracking logs and student module data
+
+import os, sys, string
+import datetime
+import json
+
+from courseware.models import *
+from track.models import *
+from psychometrics.models import *
+from xmodule.modulestore import Location
+
+from django.core.management.base import BaseCommand
+
+#db = "ocwtutor"	# for debugging
+db = "default"
+
+class Command(BaseCommand):
+    help = "initialize PsychometricData tables from StudentModule instances (and tracking data, if in SQL)."
+    help += "Note this is done for all courses for which StudentModule instances exist."
+
+    def handle(self, *args, **options):
+        """Create or update one PsychometricData row for every completed problem StudentModule."""
+        # delete all pmd (disabled: existing rows are updated in place by the loop below)
+
+        #PsychometricData.objects.all().delete()
+        #PsychometricData.objects.using(db).all().delete()
+
+        smset = StudentModule.objects.using(db).exclude(max_grade=None)
+
+        for sm in smset:
+            url = sm.module_state_key
+            location = Location(url)
+            if location.category != "problem":
+                continue
+            try:
+                state = json.loads(sm.state)
+                done = state['done']
+            except Exception:		# unusable state: report and skip it, but let Ctrl-C / SystemExit through
+                print "Oops, failed to eval state for %s (state=%s)" % (sm,sm.state)
+                continue
+
+            if done:			# only keep if problem completed
+                try:
+                    pmd = PsychometricData.objects.using(db).get(studentmodule=sm)
+                except PsychometricData.DoesNotExist:
+                    pmd = PsychometricData(studentmodule=sm)
+
+                pmd.done = done
+                pmd.attempts = state['attempts']
+
+                # get attempt times from tracking log
+                uname = sm.student.username
+                tset = TrackingLog.objects.using(db).filter(username=uname, event_type__contains='save_problem_check')
+                tset = tset.filter(event_source='server')
+                tset = tset.filter(event__contains="'%s'" % url)
+                checktimes = [x.dtcreated for x in tset]	# presumably datetime objects - confirm against TrackingLog
+                pmd.checktimes = repr(checktimes)	# repr text: psychoanalyze.py reads it back with eval(); json.dumps cannot serialize datetimes
+                if len(checktimes) != pmd.attempts:
+                    print "Oops, mismatch in number of attempts and check times for %s" % pmd
+
+                #print pmd
+                pmd.save(using=db)
+
+        print "%d PMD entries" % PsychometricData.objects.using(db).all().count()
diff --git a/lms/djangoapps/psychometrics/models.py b/lms/djangoapps/psychometrics/models.py
new file mode 100644
index 0000000000..4ffdf59120
--- /dev/null
+++ b/lms/djangoapps/psychometrics/models.py
@@ -0,0 +1,45 @@
+#
+# db model for psychometrics data
+#
+# this data is collected in real time
+#
+
+from django.db import models
+from courseware.models import StudentModule
+
+class PsychometricData(models.Model):
+    """
+    Per-student, per-problem performance record: whether the problem is done,
+    the number of attempts, and the times at which it was checked.
+
+    Each row links to one StudentModule instance, and only to those for capa problems.
+
+    StudentModule.module_state_key is nominally a Location instance (url string) of the form
+    {tag}://{org}/{course}/{category}/{name}[@{revision}]
+    where category = "problem" for capa problems.
+
+    checktimes is extracted from tracking logs, or added by capa module via psychometrics callback.
+    """
+
+    studentmodule = models.ForeignKey(StudentModule, db_index=True, unique=True)   # gives student, module_state_key, course_id
+
+    done = models.BooleanField(default=False)
+    attempts = models.IntegerField(default=0)			# taken from studentmodule.state
+    checktimes = models.TextField(null=True, blank=True)	# holds a list of datetime objects (kept as text)
+
+    # available through the studentmodule link instead of being stored here:
+    #   grade     = studentmodule.grade
+    #   max_grade = studentmodule.max_grade
+    #   student   = studentmodule.student
+    #   course_id = studentmodule.course_id
+    #   location  = studentmodule.module_state_key
+
+    def __unicode__(self):
+        """One-line summary of this record and the StudentModule it links to."""
+        module = self.studentmodule
+        template = "[PsychometricData] %s url=%s, grade=%s, max=%s, attempts=%s, ct=%s"
+        values = (module.student, module.module_state_key,
+                  module.grade, module.max_grade,
+                  self.attempts, self.checktimes)
+        return template % values
+    
diff --git a/lms/djangoapps/psychometrics/psychoanalyze.py b/lms/djangoapps/psychometrics/psychoanalyze.py
new file mode 100644
index 0000000000..e8dd7b4684
--- /dev/null
+++ b/lms/djangoapps/psychometrics/psychoanalyze.py
@@ -0,0 +1,312 @@
+#
+# File:   psychometrics/psychoanalyze.py
+#
+# generate psychometrics plots from PsychometricData
+
+from __future__ import division
+
+import datetime
+import logging
+import json
+import math
+import numpy as np
+from scipy.optimize import curve_fit
+
+from django.db.models import Sum, Max
+from psychometrics.models import *
+from xmodule.modulestore import Location
+
+log = logging.getLogger("mitx.psychometrics")
+
+#db = "ocwtutor"	# for debugging
+db = "default"
+
+#-----------------------------------------------------------------------------
+# fit functions
+
+def func_2pl(x,a,b):
+    """
+    2-parameter logistic function 1/(1+exp(-D*a*(x-b))), D=1.7: a sets the steepness, b is the x where the value is 0.5; x may be a scalar or a numpy array
+    """
+    D = 1.7
+    z = D*a*(x-b)
+    return 1.0 / (1.0 + np.exp(-z))	# tends to 1 for large z, where exp(z)/(1+exp(z)) overflowed to inf/inf = nan
+
+#-----------------------------------------------------------------------------
+# statistics class
+
+class StatVar(object):
+    """
+    Simple statistics on floating point numbers: avg, sdv, var, min, max
+
+    Accumulates the count, sum and sum of squares of the values passed to add().
+    avg() and sdv() are expressed in multiples of unit, var() in multiples of unit**2;
+    min and max keep the raw values.  With no data, avg(), var() and sdv() return 0.
+    """
+    def __init__(self,unit=1):
+        self.sum = 0
+        self.sum2 = 0
+        self.cnt = 0
+        self.unit = unit
+        self.min = None
+        self.max = None
+    def add(self,x):
+        """Record one sample; None is ignored."""
+        if x is None:
+            return
+        if self.min is None or x<self.min:
+            self.min = x
+        if self.max is None or x>self.max:
+            self.max = x
+        self.sum += x
+        self.sum2 += x**2
+        self.cnt += 1
+    def avg(self):
+        """Mean of the samples, divided by unit (0 if there are none)."""
+        if not self.cnt:		# cnt starts at 0, never None: guard the division below
+            return 0
+        return self.sum / 1.0 / self.cnt / self.unit
+    def var(self):
+        """Population variance of the samples, divided by unit**2 (0 if there are none)."""
+        if not self.cnt:
+            return 0
+        return (self.sum2 / 1.0 / self.cnt / (self.unit**2)) - (self.avg()**2)
+    def sdv(self):
+        """Square root of var(); 0 when var() is not positive (e.g. rounding left it slightly negative)."""
+        v = self.var()
+        return math.sqrt(v) if v>0 else 0
+    def __str__(self):
+        return 'cnt=%d, avg=%f, sdv=%f' % (self.cnt,self.avg(),self.sdv())
+    def __add__(self,x):
+        """Support `sv += x`: records x in place and returns self (unlike a normal +, this mutates)."""
+        self.add(x)
+        return self
+
+#-----------------------------------------------------------------------------
+# histogram generator
+
+def make_histogram(ydata,bins=None):
+    '''
+    Count how many values of ydata fall into each bin.
+
+    bins is a sequence of bin edges in increasing order (default: 0, 10, ..., 90).
+    A value y is counted under the highest edge b with y > b; values that are
+    not greater than the lowest edge are not counted at all.
+
+    Returns a dict with the bin edges as keys and the counts as values.
+    '''
+    edges = range(0,100,10) if bins is None else bins
+    counts = dict(zip(edges, [0] * len(edges)))
+    for value in ydata:
+        # walk the edges from the top down; the first edge strictly below value takes it
+        for edge in edges[::-1]:
+            if not value>edge:
+                continue
+            counts[edge] += 1
+            break
+    return counts
+    
+#-----------------------------------------------------------------------------
+
+def problems_with_psychometric_data(course_id):
+    '''
+    For the given course_id, return a dict mapping each problem (location url) that has
+    psychometric data to the number of PsychometricData entries available for it.
+    '''
+    key = 'studentmodule__module_state_key'
+    rows = PsychometricData.objects.using(db).filter(studentmodule__course_id=course_id)
+    urls = [entry[key] for entry in rows.values(key).distinct()]
+    counts = [rows.filter(**{key: url}).count() for url in urls]
+    return dict(zip(urls, counts))
+
+#-----------------------------------------------------------------------------
+
+def generate_plots_for_problem(problem):
+    """
+    Build the plots for one problem, given its module_state_key (location url).
+
+    Returns (msg, plots): msg is an html status string; plots is a list of dicts (keys: title, id, info, data, cmd)
+    for the grade histogram, the histogram of times between checks, and one IRT curve per grade.
+    """
+    pmdset = PsychometricData.objects.using(db).filter(studentmodule__module_state_key=problem)
+    nstudents = pmdset.count()
+    msg = ""
+    plots = []
+
+    if nstudents < 2:
+        msg += "%s nstudents=%d --> skipping, too few" % (problem,nstudents)
+        return msg, plots
+
+    max_grade = pmdset[0].studentmodule.max_grade
+    agdat = pmdset.aggregate(Sum('attempts'), Max('attempts'))
+    max_attempts = agdat['attempts__max']
+    total_attempts = agdat['attempts__sum']	# not used yet
+    msg += "max attempts = %d" % max_attempts
+    xdat = range(1,max_attempts+1)
+    dataset = {'xdat': xdat}
+
+    # generate grade histogram
+    axisopts = """{
+        xaxes: [{
+            axisLabel: 'Grade'
+        }],
+        yaxes: [{
+            position: 'left',
+            axisLabel: 'Count'
+         }]
+         }"""
+
+    if max_grade > 1:
+        ghist = make_histogram([pmd.studentmodule.grade for pmd in pmdset],np.linspace(0,max_grade,int(max_grade)+1))	# int(): the number of bin edges must be an integer
+        ghist_json = json.dumps(ghist.items())
+        plot = {'title': "Grade histogram for %s" % problem,
+                'id': 'histogram',
+                'info': '',
+                'data': "var dhist = %s;\n" % ghist_json,
+                'cmd': "[ {data: dhist, bars: { show: true }} ], %s" % axisopts,
+                }
+        plots.append(plot)
+    else:
+        msg += "<br/>Not generating histogram: max_grade=%s" % max_grade
+
+    # histogram of time differences between checks
+    # Warning: this is inefficient - doesn't scale to large numbers of students
+    dtset = []	# time differences in minutes
+    dtsv = StatVar()
+    for pmd in pmdset:
+        try:
+            checktimes = eval(pmd.checktimes)		# NOTE(security): eval runs the stored text as code - safe only while checktimes is written solely by this app
+        except Exception:				# empty or unparsable checktimes: skip this entry
+            continue
+        if len(checktimes)<2:
+            continue
+        ct0 = checktimes[0]
+        for ct in checktimes[1:]:
+            dt = (ct-ct0).total_seconds()/60.0
+            if dt<20:			# ignore if dt too long
+                dtset.append(dt)
+                dtsv += dt
+            ct0 = ct
+    if dtsv.cnt > 2:
+        msg += "<br/>time differences between checks: %s" % dtsv
+        bins = np.linspace(0,1.5*dtsv.sdv(),30)
+        dbar = bins[1]-bins[0]
+        thist = make_histogram(dtset,bins)
+        thist_json = json.dumps(sorted(thist.items(), key=lambda x: x[0]))
+        axisopts = """{ xaxes: [{ axisLabel: 'Time (min)'}], yaxes: [{position: 'left',axisLabel: 'Count'}]}"""
+        plot = {'title': "Histogram of time differences between checks",
+                'id': 'thistogram',
+                'info': '',
+                'data': "var thist = %s;\n" % thist_json,
+                'cmd': "[ {data: thist, bars: { show: true, barWidth:%f }} ], %s" % (dbar, axisopts),
+                }
+        plots.append(plot)
+
+    # one IRT plot curve for each grade received (TODO: this assumes integer grades)
+    for grade in range(1,int(max_grade)+1):
+        yset = {}
+        gset = pmdset.filter(studentmodule__grade=grade)
+        ngset = gset.count()
+        if ngset==0:
+            continue
+        ydat = []
+        ylast = 0
+        for x in xdat:
+            y = gset.filter(attempts=x).count()/ngset
+            ydat.append( y + ylast )
+            ylast = y + ylast
+        yset['ydat'] = ydat
+
+        if len(ydat)>5:		# try to fit to logistic function if enough data points
+            try:
+                cfp = curve_fit(func_2pl, xdat, ydat, [1.0, max_attempts/2.0])
+            except RuntimeError:	# curve_fit raises this when the fit does not converge: keep the raw points only
+                log.exception("2PL fit failed for %s grade=%s" % (problem,grade))
+            else:
+                yset['fitparam'] = cfp
+                yset['fitpts'] = func_2pl(np.array(xdat),*cfp[0])
+                yset['fiterr'] = [yd-yf for (yd,yf) in zip(ydat,yset['fitpts'])]
+                fitx = np.linspace(xdat[0],xdat[-1],100)
+                yset['fitx'] = fitx
+                yset['fity'] = func_2pl(np.array(fitx),*cfp[0])
+
+        dataset['grade_%d' % grade] = yset
+
+    axisopts = """{
+        xaxes: [{
+            axisLabel: 'Number of Attempts'
+        }],
+        yaxes: [{
+            max:1.0,
+            position: 'left',
+            axisLabel: 'Probability of correctness'
+         }]
+         }"""
+
+    # generate points for flot plot
+    for grade in range(1,int(max_grade)+1):
+        jsdata = ""
+        jsplots = []
+        gkey = 'grade_%d' % grade
+        if gkey in dataset:
+            yset = dataset[gkey]
+            jsdata += "var d%d = %s;\n" % (grade,json.dumps(zip(xdat,yset['ydat'])))
+            jsplots.append('{ data: d%d, lines: { show: false }, points: { show: true}, color: "red" }' % grade)
+            if 'fitpts' in yset:
+                jsdata += 'var fit = %s;\n' % (json.dumps(zip(yset['fitx'],yset['fity'])))
+                jsplots.append('{ data: fit,  lines: { show: true }, color: "blue" }')
+                (a,b) = yset['fitparam'][0]
+                irtinfo = "(2PL: D=1.7, a=%6.3f, b=%6.3f)" % (a,b)
+            else:
+                irtinfo = ""
+
+            plots.append({'title': 'IRT Plot for grade=%s %s' % (grade,irtinfo),
+                          'id': "irt%s" % grade,
+                          'info': '',
+                          'data': jsdata,
+                          'cmd' : '[%s], %s' % (','.join(jsplots), axisopts),
+                })
+
+    return msg, plots
+
+#-----------------------------------------------------------------------------
+
+def make_psychometrics_data_update_handler(studentmodule):
+    """
+    Construct and return a procedure which may be called to update
+    the PsychometricData instance for the given StudentModule instance.
+    """
+    sm = studentmodule
+    try:
+        pmd = PsychometricData.objects.using(db).get(studentmodule=sm)
+    except PsychometricData.DoesNotExist:
+        pmd = PsychometricData(studentmodule=sm)	# not saved until the handler runs
+
+    def psychometrics_data_update_handler(state):
+        """
+        This function may be called each time a problem is successfully checked
+        (eg on save_problem_check events in capa_module).
+
+        state = instance state (currently unused: the handler re-reads sm.state); failures are logged, not raised
+        """
+        try:
+            state = json.loads(sm.state)
+            done, attempts = state['done'], state['attempts']
+        except Exception:
+            log.exception("Oops, failed to eval state for %s (state=%s)" % (sm,sm.state))
+            return
+
+        pmd.done = done
+        pmd.attempts = attempts
+        try:
+            checktimes = eval(pmd.checktimes)		# NOTE(security): eval runs the stored text as code - safe only while checktimes is written solely by this app
+        except Exception:				# nothing stored yet, or unparsable text: start a fresh log
+            checktimes = []
+        checktimes.append(datetime.datetime.now())
+        pmd.checktimes = repr(checktimes)		# keep text on the instance so the eval above still works on the next call
+        try:
+            pmd.save(using=db)
+        except Exception:
+            log.exception("Error in updating psychometrics data for %s" % sm)
+
+    return psychometrics_data_update_handler