import json
import codecs

import tablib
from loupe import flatten_dict
from itertools import chain
from optparse import OptionParser
from collections import namedtuple

'''
Convert a file of JSON results (one JSON object per line) into Orange's
tab-delimited .tab format, documented at:
http://orange.biolab.si/doc/reference/Orange.data.formats/
'''
OrangeType = namedtuple('OrangeType', 'f_type, flag')

CONTINUOUS = OrangeType('c', '')
DISCRETE = OrangeType('d', '')
IGNORE = OrangeType('s', 'i')
meta_features = {'title': OrangeType('s', 'meta'),
                 'id': OrangeType('s', 'meta'),
                 'ah_topic': OrangeType('s', 'meta'),
                 'ah_current': OrangeType('s', 'meta'),
                 'ah_actions': IGNORE,
                 'assessment': IGNORE,
                 }
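
# Orange .tab files (see the URL above) carry three header rows: feature
# names, a type code per column ('c' continuous, 'd' discrete, 's' string),
# and an optional flag ('meta' marks meta attributes, 'i' ignores a column).
# Each OrangeType above holds the (type, flag) pair for one column.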


def load_results(file_name):
    # One JSON object per line; decode as UTF-8 and stream rows lazily.
    return (json.loads(line.strip())
            for line in codecs.open(file_name, encoding='utf-8'))
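
# A hypothetical input line, for illustration (any flat or nested JSON
# object works; nested keys are flattened by flatten_dict below):
#   {"title": "Some page", "id": 42, "ah_topic": "History"}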


def get_column_names(flat_row_list, count=100):
    # Union of the keys seen in the first `count` flattened rows.
    if not flat_row_list:
        return []
    all_keys = [f.keys() for f in flat_row_list[:count]]
    column_names = set(chain.from_iterable(all_keys))
    return list(column_names)
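
# For example, with two hypothetical rows:
#   get_column_names([{'a': 1}, {'a': 2, 'b': 3}])
# returns ['a', 'b'] (built from a set, so order is unspecified).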


def get_column_types(dataset, count=100):
    # Map each column to an OrangeType, sampling up to `count` values.
    ret = {}
    for header in dataset.headers:
        if header in meta_features:
            ret[header] = meta_features[header]
            continue
        try:
            value_set = set(dataset[header][:count])
        except TypeError:
            # Unhashable values (e.g. nested lists): ignore the column.
            ret[header] = IGNORE
            continue
        try:
            [float(f) for f in value_set if f != '']
        except (TypeError, ValueError):
            # Non-numeric values: treat the column as an ignored string.
            ret[header] = IGNORE
        else:
            if len(value_set) > 10:
                ret[header] = CONTINUOUS
            elif len(value_set) > 1:
                ret[header] = DISCRETE
            else:
                ret[header] = IGNORE
    return ret
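
# A toy illustration of the heuristic above (hypothetical data):
#   ds = tablib.Dataset(['1', 'x'], ['2', 'y'], headers=['n', 'tag'])
#   get_column_types(ds)  ->  {'n': DISCRETE, 'tag': IGNORE}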


def ordered_yield(data, ordering, default=None):
    # Yield data's values in the order given by `ordering`,
    # filling in `default` for missing keys.
    for o in ordering:
        yield data.get(o, default)
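
# e.g. list(ordered_yield({'b': 2}, ['a', 'b'], ''))  ->  ['', 2]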


def results_to_csv(file_name):
    # Despite the name, this writes Orange's tab-separated .tab format.
    output_name = file_name.partition('.')[0] + '.tab'
    results = load_results(file_name)
    flat = [flatten_dict(row) for row in results]
    column_names = get_column_names(flat)
    tab_results = tablib.Dataset(headers=column_names)
    for row in flat:
        tab_results.append(list(ordered_yield(row, column_names, '')))
    column_types = get_column_types(tab_results)
    # Prepend Orange's type and flag header rows (column order must match).
    types = list(ordered_yield(column_types, column_names, IGNORE))
    tab_results.insert(0, [c.f_type for c in types])
    tab_results.insert(1, [c.flag for c in types])
    with codecs.open(output_name, 'w', 'utf-8') as output:
        # tablib's tsv export is text on Python 3; the Python 2-era tablib
        # returned bytes here and needed an explicit .decode('utf-8').
        output.write(tab_results.tsv)
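
# The written .tab file starts with Orange's three header rows (names,
# type codes, flags), tab-separated; data rows follow. For hypothetical
# columns 'title' (meta string) and 'score' (continuous):
#   title   score
#   s       c
#   meta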


def parse_args():
    # No options yet; the input file name arrives as a positional argument.
    parser = OptionParser(usage='usage: %prog results_file')
    return parser.parse_args()


if __name__ == '__main__':
    opts, args = parse_args()
    file_name = args[0]
    results_to_csv(file_name)