Merge pull request #29 from leibatt/leilani

Leilani: integrate Zgraggen et al. insight examples.
leibatt · Dec 8, 2022 · 81e7e8c · 81e7e8c
2 parents 9a8531c + dfd37b3
commit 81e7e8c
Show file tree

Hide file tree

Showing 18 changed files with 1,120 additions and 3 deletions.
diff --git a/datasets/sleep_generated.json b/datasets/sleep_generated.json
diff --git a/examples/relationships/linearRegressionExample.ts b/examples/relationships/linearRegressionExample.ts
@@ -11,7 +11,7 @@ console.log("first row of cars dataset:",cars.records[0]);
 // linear regression relationship model object, and specify which data
 // attributes are involved in the relationship:
 const dtrm: pyxis.LinearRegressionRelationshipModel = new pyxis.LinearRegressionRelationshipModel(
-  "cars", // give the dataset a name, we can just call it cars
+  "cars_lrrm", // give the model a name
   [ // input attributes, the attributes used to predict a certain outcome
     {
       name: "Weight_in_lbs", // Attribute name from the cars dataset

diff --git a/examples/tsconfig.json b/examples/tsconfig.json
@@ -31,7 +31,8 @@
     "use_cases/amar.ts",
     "use_cases/north.ts",
     "use_cases/mathisen.ts",
-    "use_cases/battle/battle.ts"
+    "use_cases/battle/battle.ts",
+    "use_cases/zgraggen/zgraggen.ts"
   ],
   "include": ["../src/**/*.ts", "../types/*.d.ts"]
 }
diff --git a/examples/use_cases/.gitignore b/examples/use_cases/.gitignore
@@ -0,0 +1 @@
+macau
diff --git a/examples/use_cases/zgraggen/.gitignore b/examples/use_cases/zgraggen/.gitignore
@@ -0,0 +1 @@
+env
diff --git a/examples/use_cases/zgraggen/README.md b/examples/use_cases/zgraggen/README.md
@@ -0,0 +1,28 @@
+# Recreating Zgraggen et al. Insights
+
+This folder contains recreations of insights from the Zgraggen multiple comparisons paper:
+* Zgraggen, E., Zhao, Z., Zeleznik, R. and Kraska, T., 2018, April.
+ Investigating the effect of the multiple comparisons problem in visual
+ analysis. In Proceedings of the 2018 CHI Conference on Human Factors in
+ Computing Systems (pp. 1-12).
+
+The insight objects are represented in the (near) original JSON format in
+`zgraggen_insight_examples.json`. The file `mapping.ts` contains our code for
+mapping the specification into corresponding Pyxis objects. The file
+`zgraggen.ts` shows the results of mapping the examples from the original JSON
+format to Pyxis objects.
+
+We had to create our own sleep dataset to recreate these insights. Our data is
+available in `sleep_sample.csv`. We used the code referenced in Zgraggen et al.
+(the Macau project, see `data_generator.py` from
+[this repository](https://github.com/zheguang/macau/blob/master/data_generator.py))
+to generate a larger dataset file with 300 rows, as was done in the original
+experiment. The synthetic dataset is located in `sleep_generated.csv`.
+
+We also tweaked the insight specification language proposed by Zgraggen et al.
+so that it represents executable JavaScript logic. Specifically, Zgraggen et al.
+express filters as "between" predicates (similar to SQL) whose left-hand side
+was not written to be executable. For example, we had to change "75 < age >= 55",
+which does not make sense in terms of code execution, into "75 > age && age >=
+55". Notice how the left-hand comparator is flipped and how we split the
+filter into two predicates joined with the logical AND operator "&&".
diff --git a/examples/use_cases/zgraggen/data_generator.py b/examples/use_cases/zgraggen/data_generator.py
@@ -0,0 +1,342 @@
+#!/usr/bin/env python
+'''
+data_generator.py generates data with embedded correlations.
+Copyright (C) 2017  Zheguang Zhao <[email protected]>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+'''
+import sys
+import math
+import numpy
+import random
+import re
+
+
class CovarianceMatrix:
    """Covariance matrix over a group of typed attributes.

    ``attrs`` lists the attribute names and ``entries`` is the square matrix
    (a list of rows) in which ``entries[i][j]`` holds the covariance between
    ``attrs[i]`` and ``attrs[j]``. ``table`` is stored as given.
    """

    def __init__(self, table=None, attrs=None, entries=None):
        # Build fresh containers per instance; the former ``[]`` defaults were
        # shared by every matrix created without explicit arguments.
        self.attrs = [] if attrs is None else attrs
        self.entries = [] if entries is None else entries
        self.table = table

    def cov(self, attr_a, attr_b):
        """Return the stored covariance between ``attr_a`` and ``attr_b``.

        Raises ``ValueError`` (from ``list.index``) if either name is unknown.
        """
        row = self.attrs.index(attr_a)
        col = self.attrs.index(attr_b)
        return self.entries[row][col]

    def set_cov(self, attr_a, attr_b, covariance):
        """Store ``covariance`` symmetrically for the pair ``(attr_a, attr_b)``."""
        row = self.attrs.index(attr_a)
        col = self.attrs.index(attr_b)
        self.entries[row][col] = covariance
        self.entries[col][row] = covariance

    def sample(self, attr_synopses=None, n_samples=1):
        """Draw ``n_samples`` joint-normal rows and return them as string columns.

        ``attr_synopses`` maps attribute name to its synopsis object; one entry
        is looked up for every name in ``self.attrs``. Rows are drawn with
        ``numpy.random.multivariate_normal`` using the synopsis means and
        ``self.entries``; a row is rejected when any value falls outside the
        attribute's domain, and drawing repeats until enough rows survive.

        Returns a list of columns (one list of strings per attribute). When
        ``self.attrs`` holds a single distinct name, only one column is
        returned.
        """
        assert self.entries != []
        lookup = {} if attr_synopses is None else attr_synopses
        synopses = [lookup[attr] for attr in self.attrs]
        means = [synopsis.mu for synopsis in synopses]

        # sample from multivariate normal until enough samples fall within the empirical domain
        nonnull_samples = []
        while True:
            draws = numpy.random.multivariate_normal(means, self.entries, n_samples).tolist()
            columns = transpose(draws)

            domainized_columns = []
            for synopsis, column in zip(synopses, columns):
                if isinstance(synopsis, DiscreteAttrSynopsis):
                    column = synopsis.discretize(column)
                domainized_columns.append(list(synopsis.domainize(column)))

            # Out-of-domain values were replaced by None; drop rows containing any.
            nonnull_samples += [row for row in transpose(domainized_columns) if None not in row]
            if len(nonnull_samples) >= n_samples:
                nonnull_samples = nonnull_samples[:n_samples]
                break

        assert len(nonnull_samples) == n_samples

        formatted_columns = []
        for synopsis, column in zip(synopses, transpose(nonnull_samples)):
            # Ordinal attributes are sampled as indices; map them back to labels.
            if isinstance(synopsis, OrdinalAttrSynopsis):
                column = synopsis.categorize(column)
            formatted_columns.append(list(synopsis.stringify(column)))
        assert len(formatted_columns) == len(self.attrs), '{},{},{},{}'.format(self.attrs, self.entries, len(columns), len(synopses))

        if len(set(self.attrs)) == 1:
            # The same attribute was listed more than once: keep a single column.
            formatted_columns = [formatted_columns[0]]
        else:
            assert len(set(self.attrs)) == len(self.attrs), 'should be distinct joint normal'

        return formatted_columns

    def debug(self):
        """Return the attribute list and the matrix as a printable string."""
        return '{attrs}\n{entries}'.format(attrs=self.attrs, entries=numpy.array(self.entries))

    @staticmethod
    def fromTableAttrs(table, attrs=None, is_correlated=True):
        """Build a covariance matrix for ``attrs`` from ``table.attr_synopses``.

        Diagonal cells use a correlation of 1. Off-diagonal cells use a random
        non-zero correlation in (-1, 1) when ``is_correlated`` is truthy and 0
        otherwise. Each cell is the correlation multiplied by both attributes'
        ``sigma``.

        Raises ``RuntimeError`` if ``'id:INTEGER'`` is paired with a different
        attribute.
        """
        attrs = [] if attrs is None else attrs
        size = len(attrs)
        cov_mat = CovarianceMatrix(table=table, attrs=attrs, entries=numpy.zeros((size, size)).tolist())
        pairs = [(x, y) for x in attrs for y in attrs if x <= y]  # unique pair of attrs
        for first, second in pairs:
            if first == second:
                rho = 1
            elif 'id:INTEGER' in (first, second):
                raise RuntimeError('id attribute not supported')
            elif is_correlated:
                # Redraw until the correlation is non-zero.
                rho = 0
                while rho == 0:
                    rho = random.choice([-1, 1]) * random.random()
            else:
                rho = 0
            cov_mat.set_cov(first, second, rho * table.attr_synopses[first].sigma * table.attr_synopses[second].sigma)
        return cov_mat
+
+
class AttrSynopsis(object):
    """Summary of one typed attribute: name, value domain, mean and std-dev.

    ``attr`` is the typed attribute name (the factories below inspect it for
    the substrings ``STRING``, ``FLOAT`` and ``INTEGER``). ``domain`` is an
    inclusive ``(low, high)`` pair. ``shift`` is stored as given; nothing in
    this class reads it.
    """

    def __init__(self, attr='', domain=(-float("inf"), float("inf")), mu=0.0, sigma=1.0, shift=0):
        self.attr = attr
        self.domain = domain
        self.mu = mu
        self.sigma = sigma
        self.shift = shift

    def domainize(self, xs):
        """Lazily yield each value of ``xs``, replaced by ``None`` when outside the domain."""
        return (x if self.domain[0] <= x <= self.domain[1] else None for x in xs)

    def stringify(self, xs):
        """Lazily yield ``str(x)`` for each value of ``xs``; ``None`` becomes ``''``."""
        # Identity test: ``x == None`` is element-wise (and then un-truthable)
        # for array-like values.
        return ('' if x is None else str(x) for x in xs)

    def debug(self):
        """Return a one-line description of this synopsis."""
        return 'type={type},attr={attr},domain={domain},mu={mu},sigma={sigma}'.format(type=type(self), attr=self.attr, domain=self.domain, mu=self.mu, sigma=self.sigma)

    @staticmethod
    def _distParams(xs):
        """Return ``(mean, population std-dev, (min, max))`` of the numbers ``xs``.

        Raises ``ZeroDivisionError`` when ``xs`` is empty.
        """
        count = float(len(xs))
        mu = math.fsum(xs) / count
        sigma = math.sqrt(math.fsum(pow((x - mu), 2) for x in xs) / count)
        return (mu, sigma, (min(xs), max(xs)))

    @staticmethod
    def _numericSynopsis(attr, domain, mu, sigma):
        """Build the FLOAT or INTEGER synopsis for ``attr``; raise ``RuntimeError`` otherwise."""
        if 'FLOAT' in attr:
            return ContinuousAttrSynopsis(attr, domain=domain, mu=mu, sigma=sigma)
        if 'INTEGER' in attr:
            return DiscreteAttrSynopsis(attr, domain=domain, mu=mu, sigma=sigma)
        raise RuntimeError('unsupported type: {}'.format(attr))

    @staticmethod
    def fromAttrColumn(attr, column, set_category_order, source_attr_synopsis=None):
        """Derive a synopsis for ``attr`` from the values in ``column``.

        Empty-string cells are ignored. For ``STRING`` attributes the values
        are mapped to their index in a category order, chosen as follows:
        ``source_attr_synopsis.category_order`` if a source synopsis is given,
        else ``set_category_order`` if it is not ``None``, else a shuffled list
        of the distinct values. Mean and std-dev are computed over those
        indices. Other attributes are converted with ``float`` and summarised
        directly.

        Raises ``RuntimeError`` for an attribute that is neither STRING, FLOAT
        nor INTEGER.
        """
        column = [x for x in column if x != '']

        if 'STRING' in attr:
            if source_attr_synopsis is None:
                # Sort before shuffling: iteration order of a set of strings
                # depends on per-process hash randomization, which made the
                # result differ between runs even with a fixed random seed.
                category_order = sorted(set(column))
                random.shuffle(category_order)

                if set_category_order is not None:
                    category_order = set_category_order
            else:
                category_order = source_attr_synopsis.category_order

            ordinalized_col = [category_order.index(x) for x in column]
            (mu, sigma, _) = AttrSynopsis._distParams(ordinalized_col)
            domain = (0, len(category_order) - 1)
            return OrdinalAttrSynopsis(attr, domain=domain, mu=mu, sigma=sigma, category_order=category_order)

        num_column = [float(x) for x in column]
        (mu, sigma, domain) = AttrSynopsis._distParams(num_column)
        return AttrSynopsis._numericSynopsis(attr, domain, mu, sigma)

    @staticmethod
    def fromAttr(attr, mu, sigma, domain, category_order):
        """Build the synopsis subclass matching ``attr`` from explicit parameters.

        ``category_order`` is only used for ``STRING`` attributes. Raises
        ``AssertionError`` if ``attr`` has no ``':'`` and ``RuntimeError`` for
        an unsupported type.
        """
        assert ':' in attr, 'attr must be typed'
        if 'STRING' in attr:
            return OrdinalAttrSynopsis(attr, domain=domain, mu=mu, sigma=sigma, category_order=category_order)
        return AttrSynopsis._numericSynopsis(attr, domain, mu, sigma)
+
+
+
class DiscreteAttrSynopsis(AttrSynopsis):
    """Synopsis of an attribute whose values are whole numbers."""

    def discretize(self, xs):
        """Round each value in ``xs`` to an ``int`` and return them as a numpy array."""
        rounded = [int(round(value)) for value in xs]
        return numpy.array(rounded)
+
+
+
class ContinuousAttrSynopsis(AttrSynopsis):
    """Synopsis of a FLOAT attribute; adds nothing to ``AttrSynopsis``."""
+
+
class OrdinalAttrSynopsis(DiscreteAttrSynopsis):
    """Synopsis of a categorical attribute handled as integer category indices.

    ``category_order`` is the list of category labels; a label's position in
    the list is its index.
    """

    def __init__(self, attr, domain, mu, sigma, category_order=None):
        super().__init__(attr, domain, mu, sigma)
        # A new list per instance; the previous ``[]`` default was one shared
        # object. An explicit ``None`` is likewise treated as "no categories".
        self.category_order = [] if category_order is None else category_order

    def debug(self):
        """Return the parent description followed by the category order."""
        return '{parent},category_order={category_order}'.format(parent=super().debug(), category_order=self.category_order)

    def categorize(self, xs):
        """Lazily map each index in ``xs`` to its label; ``None`` stays ``None``."""
        return (None if x is None else self.category_order[x] for x in xs)
+
+
class Table(object):
    """A table of records with typed column names and per-column synopses.

    ``attrs`` is the list of column names, ``records`` the list of rows, and
    ``attr_synopses`` maps a column name to its synopsis object.
    """

    def __init__(self, attrs=None, records=None, attr_synopses=None):
        # Fresh containers per instance instead of shared mutable defaults.
        self.attrs = [] if attrs is None else attrs
        self.records = [] if records is None else records
        self.attr_synopses = {} if attr_synopses is None else attr_synopses

    def debug(self):
        """Return the column names and every synopsis description as text."""
        attrs = 'attrs:\n{attrs}'.format(attrs=str(self.attrs))
        attr_synopses = 'attr_synopses:\n{attr_synopses}'.format(attr_synopses='\n'.join(map(lambda x: x.debug(), self.attr_synopses.values())))
        return '{attrs}\n{attr_synopses}'.format(attrs=attrs, attr_synopses=attr_synopses)

    @staticmethod
    def _categoryOrderFromNote(attr, note):
        """Extract the ``category_order=[...]`` list recorded for ``attr`` in ``note``.

        Raises ``RuntimeError`` when the note has no matching entry.
        """
        # re.escape keeps regex metacharacters in the column name literal.
        pattern = r'.*(' + re.escape(attr) + r',domain.*category_order=\[(.*)\])'
        match = re.search(pattern, note)
        if match is None:
            raise RuntimeError('no category_order found for {} in note file'.format(attr))
        return [s.strip().replace('\'', '') for s in match.group(2).split(',')]

    @staticmethod
    def fromFile(fpath, existing_note_file):
        """Load a comma-separated file whose first line holds the typed column names.

        Lines are split on ``','`` with no quoting support. When
        ``existing_note_file`` is not ``None``, columns whose name contains
        ``'STRING:TreatAsEnumeration'`` take their category order from that
        note file (and their name is printed) instead of a random one.
        """
        with open(fpath, 'r') as f:
            data = [line.strip().split(',') for line in f]
        attrs = data[0]
        records = data[1:]
        columns = transpose(records)
        assert len(columns) == len(attrs)

        note = None  # read at most once, and only if some column needs it
        attr_synopses = {}
        for attr, column in zip(attrs, columns):
            set_category_order = None
            if existing_note_file is not None and 'STRING:TreatAsEnumeration' in attr:
                if note is None:
                    with open(existing_note_file, 'r') as content_file:
                        note = content_file.read()
                print(attr)
                set_category_order = Table._categoryOrderFromNote(attr, note)

            attr_synopses[attr] = AttrSynopsis.fromAttrColumn(attr, column, set_category_order)
        return Table(attrs=attrs, records=records, attr_synopses=attr_synopses)

    def writeToFile(self, fpath):
        """Write the header line and every record to ``fpath``, comma-separated."""
        with open(fpath, 'w') as f:
            f.write('{}\n'.format(','.join(self.attrs)))
            for record in self.records:
                f.write('{}\n'.format(','.join(map(str, record))))
+
+
class SampleTable(Table):
    """A table of synthetic records sampled from covariance matrices.

    In addition to the ``Table`` fields it records the ``seed``, the group
    size ``n_variate``, the covariance matrices ``cov_mats`` used for sampling
    and the ``source_table`` the statistics came from.
    """

    def __init__(self, attrs=None, records=None, attr_synopses=None, seed=1, n_variate=2, cov_mats=None, source_table=None):
        # Resolve the None defaults here so the parent never receives None and
        # no container is shared between instances.
        super().__init__(
            attrs=[] if attrs is None else attrs,
            records=[] if records is None else records,
            attr_synopses={} if attr_synopses is None else attr_synopses,
        )
        self.seed = seed
        self.n_variate = n_variate
        self.cov_mats = [] if cov_mats is None else cov_mats
        self.source_table = source_table

    def debug(self):
        """Return the parent description plus seed, matrices and source table."""
        return '{parent}\nseed={seed}\nn_variate={n_variate}\ncov_mats:\n{cov_mats}\nsource_table:\n{source_table}'.format(parent=(super().debug()), seed=self.seed, n_variate=self.n_variate, cov_mats='\n'.join(map(lambda x: x.debug(), self.cov_mats)), source_table='None' if self.source_table is None else self.source_table.debug())

    @staticmethod
    def fromTable(table, seed=1, n_samples=100, correlated_bivariate_ratio=0.0):
        """Sample ``n_samples`` synthetic rows that mimic ``table``.

        The non-id attributes are shuffled and grouped into pairs (with an odd
        count, the last attribute is paired with itself). The first
        ``floor(len(pairs) * correlated_bivariate_ratio)`` pairs get a random
        non-zero correlation, the rest none. An ``'id:INTEGER'`` column is
        filled with ``0 .. n_samples - 1``.

        ``seed`` is only recorded on the result; this method does not seed any
        random generator itself.
        """
        attrs_perm = [x for x in table.attrs if x != 'id:INTEGER']
        random.shuffle(attrs_perm)
        assert 'id:INTEGER' not in attrs_perm

        n_variate = 2
        last_index = len(attrs_perm) - 1
        variates = [
            [attrs_perm[n_variate * i], attrs_perm[min(n_variate * i + 1, last_index)]]
            for i in range(math.ceil(len(attrs_perm) / n_variate))
        ]

        n_correlated = math.floor(len(variates) * correlated_bivariate_ratio)
        is_correlateds = [1] * n_correlated + [0] * (len(variates) - n_correlated)
        assert len(is_correlateds) == len(variates)
        cov_mats = [
            CovarianceMatrix.fromTableAttrs(table, attrs=pair, is_correlated=flag)
            for pair, flag in zip(variates, is_correlateds)
        ]

        # Every matrix is built before any sampling starts, then the sampled
        # columns are concatenated in shuffled-attribute order.
        columns_perm = []
        for cov_mat in cov_mats:
            columns_perm += cov_mat.sample(table.attr_synopses, n_samples)

        attr_columns_perm = dict(zip(attrs_perm, columns_perm))
        columns = [
            list(range(n_samples)) if attr == 'id:INTEGER' else attr_columns_perm[attr]
            for attr in table.attrs
        ]
        attr_synopses = {
            attr: AttrSynopsis.fromAttrColumn(attr, column, None, table.attr_synopses[attr])
            for attr, column in zip(table.attrs, columns)
        }
        rows = transpose(columns)

        return SampleTable(attrs=table.attrs, records=rows, attr_synopses=attr_synopses, seed=seed, n_variate=n_variate, cov_mats=cov_mats, source_table=table)

    @staticmethod
    def fromGroundTruth(truth, seed=1, n_samples=100):
        """Sample ``n_samples`` rows from the matrices carried by ``truth``.

        ``truth`` must provide ``typed_attrs``, ``cov_mats`` and
        ``attr_synopses``. ``seed`` is only recorded on the result.
        """
        assert 'id:INTEGER' not in truth.typed_attrs

        n_variate = 2
        cov_mats = truth.cov_mats

        mv_columns_perm = [cov_mat.sample(truth.attr_synopses, n_samples) for cov_mat in cov_mats]

        attr_columns_perm = {}
        for cov_mat, mv_cols in zip(cov_mats, mv_columns_perm):
            # Pair names with the columns actually returned: sample() yields a
            # single column when the matrix lists the same attribute twice, so
            # indexing mv_cols[1] unconditionally raised IndexError.
            for attr, column in zip(cov_mat.attrs, mv_cols):
                attr_columns_perm[attr] = column

        columns = [attr_columns_perm[attr] for attr in truth.typed_attrs]
        attr_synopses = {
            attr: AttrSynopsis.fromAttrColumn(attr, column, None, truth.attr_synopses[attr])
            for attr, column in zip(truth.typed_attrs, columns)
        }
        rows = transpose(columns)

        return SampleTable(attrs=truth.typed_attrs, records=rows, attr_synopses=attr_synopses, seed=seed, n_variate=n_variate, cov_mats=cov_mats, source_table=truth)

    def writeToFile(self, fpath):
        """Write the records to ``fpath`` and the debug description to ``fpath + '.note'``."""
        super().writeToFile(fpath)

        with open('{}.note'.format(fpath), 'w') as f:
            f.write(self.debug())
+
+
def transpose(rows):
    """Return the columns of ``rows`` as a list of lists.

    The width is taken from the first row. A later row that is shorter raises
    ``IndexError``; extra cells in a longer row are ignored. An empty ``rows``
    yields ``[]`` (it previously raised ``IndexError`` on ``rows[0]``).
    """
    if len(rows) == 0:
        return []
    width = len(rows[0])
    return [[row[i] for row in rows] for i in range(width)]
+
+
def main(argv):
    """Command-line entry point.

    ``argv`` is laid out like ``sys.argv``: program name, input CSV path,
    output CSV path, seed, number of samples, correlated-bivariate ratio, and
    optionally the path of an existing ``.note`` file whose category orders
    should be reused. ``-h`` as the first argument prints the usage line.

    Raises ``SystemExit`` carrying the usage line when required arguments are
    missing (this used to surface as a bare ``IndexError``).
    """
    usage = 'usage: data_generator.py input.csv output.csv seed n_samples correlated_bivariate_ratio'
    if len(argv) > 1 and argv[1] == '-h':
        print(usage)
        return
    if len(argv) < 6:
        raise SystemExit(usage)

    fpath = argv[1]
    opath = argv[2]
    seed = int(argv[3])
    n_samples = int(argv[4])
    correlated_bivariate_ratio = float(argv[5])

    existing_note_file = None
    if len(argv) == 7:
        existing_note_file = argv[6]

    # Seed both generators: the code uses `random` and `numpy.random`.
    random.seed(seed)
    numpy.random.seed(seed)

    table = Table.fromFile(fpath, existing_note_file)

    sample_table = SampleTable.fromTable(table, seed=seed, n_samples=n_samples, correlated_bivariate_ratio=correlated_bivariate_ratio)
    sample_table.writeToFile(opath)
+
+
# Run the generator only when this file is executed as a script.
if '__main__' == __name__:
    main(argv=sys.argv)
+