# -*- coding: utf-8 -*-
########################################################################
## Check that offense category is reasonably well distributed across   #
## ... the samples created in tenfold-crossvalidation.py               #
## Note: recommend running with pypy, much much faster.                #
########################################################################

import json,os

# Configuration: input locations and the offense category to examine.
indirname = '../baileyfiles/'
offensedict_fn = os.path.join(indirname, 'offensedict.json')

# Target category; edit this value to check a different offense.
offensecat = 'breakingpeace'

# True when offensecat names a broad category (e.g. "theft") rather than a
# specific subcategory (e.g. "theft-simplelarceny").
broadcat = True

def flatten(x):
    """Recursively flatten arbitrarily nested lists into one flat list.

    Args:
        x: Any object. Lists (including list subclasses) are descended
            into; every other value is treated as a single leaf.

    Returns:
        A new list holding the leaves in left-to-right order. A non-list
        argument yields a one-element list; an empty list yields [].
    """
    # isinstance (not an exact type comparison) so list subclasses are
    # flattened as well instead of being kept as opaque leaves.
    if not isinstance(x, list):
        return [x]
    flat = []
    for item in x:
        flat.extend(flatten(item))
    return flat

 
# Load the offense dictionary from JSON.
# NOTE(review): presumably it maps each offense category name to a (possibly
# nested) list of trial ids -- confirm against the script that writes
# offensedict.json.
with open(offensedict_fn, 'r') as f0:
    offensedict = json.load(f0)

if broadcat:
    # A broad category covers every key that starts with its name, so gather
    # the values of all matching keys and flatten them into a single list.
    # .items() is used instead of .iteritems() because it exists on both
    # Python 2 and Python 3.
    mylist = [value for key, value in offensedict.items()
              if key.startswith(offensecat)]
    trials_in_cat = flatten(mylist)
else:
    # Exact category: take its entry as-is (raises KeyError if absent).
    trials_in_cat = offensedict[offensecat]


# Count how often the target offense category occurs in each sample file.
# Each file in sampledirname is read line by line; every line is stripped
# before the lookup because the raw line still carries its trailing '\n',
# which would make the comparison against the trial ids fail.

instancetotal = 0
sampledirname = '../baileyfiles/Samples_1830s/'

# A membership test against a list scans the whole list every time. Build a
# set once so each lookup inside the loop is constant time.
# Assumes the trial ids are hashable (strings), which they must be to ever
# match a stripped text line.
trial_lookup = set(trials_in_cat)

# Single-argument parenthesised print gives identical output on Python 2
# (print statement) and Python 3 (print function).
print('Offense category checked for: %s' % (offensecat))
for fn in sorted(os.listdir(sampledirname)):
    currentfile = os.path.join(sampledirname, fn)
    with open(currentfile) as f1:
        instances = sum(1 for line in f1 if line.strip() in trial_lookup)
    print("%s: %d" % (fn, instances))
    instancetotal += instances

