Add jupyter notebook with common neighbours metric on football dataset #342

Merged 3 commits on Feb 13, 2019
302 changes: 302 additions & 0 deletions research/common_neighbours_football.ipynb
@@ -0,0 +1,302 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load the dataset."
]
},
{
"cell_type": "code",
"execution_count": 195,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"with open(\"research/datasets/football_key.tsv\") as f:\n",
" # Each line is of form: <country_id> <country_name>\n",
" def fmt(line):\n",
" return (int(line[0])-1, line[1].strip('\"'))\n",
" data_key = [fmt(line.strip().split()) for line in f if line[0] != '*']"
]
},
{
"cell_type": "code",
"execution_count": 196,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"with open(\"research/datasets/football_pairs.tsv\") as f:\n",
" # Each line is of form: <country_a_id> <country_b_id> <number_of_players>\n",
" def fmt(pair):\n",
" return (int(pair[0])-1, int(pair[1])-1, 1)\n",
" data_pairs = [fmt(line.strip().split()) for line in f if line[0] != '*']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Turn into useful format\n",
"\n",
"Edit the `neighbours[]` below and `similarity` func below that to create a new metric."
]
},
{
"cell_type": "code",
"execution_count": 197,
"metadata": {},
"outputs": [],
"source": [
"neighbours = [set() for _ in range(len(data_key))]\n",
"for p in data_pairs:\n",
" neighbours[p[0]].add(p[1])\n",
" neighbours[p[1]].add(p[0])"
]
},
{
"cell_type": "code",
"execution_count": 198,
"metadata": {},
"outputs": [],
"source": [
"def similarity_CN(x, y):\n",
" # Common neighbours\n",
" # x, y are indices to neighbours[]\n",
" return len(neighbours[x] & neighbours[y])"
]
},
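{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a sketch of one possible alternative metric (not part of the original analysis): Jaccard similarity normalises the common-neighbour count by the size of the combined neighbourhood. The name `similarity_jaccard` is illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def similarity_jaccard(x, y):\n",
" # x, y are indices to neighbours[], as in similarity_CN.\n",
" # Fraction of the combined neighbourhood that is shared.\n",
" union = neighbours[x] | neighbours[y]\n",
" return len(neighbours[x] & neighbours[y]) / len(union) if union else 0.0"
]
},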
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Compute similarities."
]
},
{
"cell_type": "code",
"execution_count": 199,
"metadata": {},
"outputs": [],
"source": [
"# S_CN[x][y] contains the similarity of nodes x and y using the Common Neighbours (CN) metric.\n",
"S_CN = [[0 for _ in range(len(data_key))] for _ in range(len(data_key))]\n",
"for i in range(len(data_key)-1):\n",
" for j in range(0, len(data_key)):\n",
" S_CN[i][j] = similarity_CN(i, j)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 200,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ARG AUT BEL BGR BRA CHE CHL CMR COL DEU DNK ESP FRA GBR GRE HRV IRN\n",
"ARG 4, 2, 1, 1, 3, 0, 1, 2, 2, 2, 2, 2, 1, 1, 0, 2, 0\n",
"AUT 2, 8, 3, 2, 3, 1, 1, 4, 2, 6, 4, 5, 4, 3, 1, 4, 1\n",
"BEL 1, 3, 6, 1, 2, 1, 1, 3, 1, 5, 3, 5, 3, 3, 1, 2, 1\n",
"BGR 1, 2, 1, 4, 2, 0, 0, 4, 1, 1, 3, 1, 1, 0, 0, 3, 1\n",
"BRA 3, 3, 2, 2, 7, 0, 1, 5, 2, 3, 2, 3, 1, 1, 0, 2, 0\n",
"CHE 0, 1, 1, 0, 0, 2, 0, 0, 0, 2, 0, 2, 2, 1, 0, 0, 0\n",
"CHL 1, 1, 1, 0, 1, 0, 3, 1, 3, 2, 1, 2, 1, 2, 0, 1, 0\n",
"CMR 2, 4, 3, 4, 5, 0, 1, 9, 2, 4, 4, 3, 3, 2, 0, 5, 1\n",
"COL 2, 2, 1, 1, 2, 0, 3, 2, 5, 3, 2, 3, 2, 2, 0, 2, 0\n",
"DEU 2, 6, 5, 1, 3, 2, 2, 4, 3, 19, 3, 13, 9, 10, 3, 3, 0\n",
"DNK 2, 4, 3, 3, 2, 0, 1, 4, 2, 3, 7, 3, 3, 3, 0, 5, 1\n",
"ESP 2, 5, 5, 1, 3, 2, 2, 3, 3, 13, 3, 18, 9, 9, 3, 3, 1\n",
"FRA 1, 4, 3, 1, 1, 2, 1, 3, 2, 9, 3, 9, 13, 5, 1, 3, 1\n",
"GBR 1, 3, 3, 0, 1, 1, 2, 2, 2, 10, 3, 9, 5, 12, 2, 2, 0\n",
"GRE 0, 1, 1, 0, 0, 0, 0, 0, 0, 3, 0, 3, 1, 2, 3, 0, 0\n",
"HRV 2, 4, 2, 3, 2, 0, 1, 5, 2, 3, 5, 3, 3, 2, 0, 6, 1\n",
"IRN 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1\n"
]
}
],
"source": [
"# A quick eyeball check of a subset of the data.\n",
"num_to_print = len(data_key)//2\n",
"print(' '*4 + ' '.join(d[1] for d in data_key[:num_to_print]))\n",
"print('\\n'.join(data_key[i][1] + ' ' + ','.join('{:>3}'.format(c) for c in S_CN[i][:num_to_print]) for i in range(num_to_print)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create test sets.\n",
"\n",
"Split the list of links into 10 random partitions, as the paper does, to get comparable measurements. Also create a set of all links which are not in the dataset."
]
},
{
"cell_type": "code",
"execution_count": 201,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"len(e) 118\n",
"len(e)//10 = 11\n",
"10 subsets:\n",
"[(4, 19), (24, 28), (5, 23), (19, 20), (6, 17), (21, 26), (13, 25), (10, 17), (17, 24), (7, 19), (21, 30), (28, 31)]\n",
"[(11, 21), (9, 33), (23, 32), (14, 25), (15, 17), (9, 21), (11, 33), (17, 34), (12, 17), (4, 27), (1, 7), (11, 28)]\n",
"[(15, 31), (9, 16), (2, 24), (4, 26), (24, 34), (11, 27), (13, 18), (13, 15), (9, 30), (31, 34), (11, 23), (7, 17)]\n",
"[(10, 11), (3, 31), (10, 24), (7, 9), (17, 33), (12, 21), (25, 29), (22, 27), (1, 12), (2, 23), (11, 24), (7, 26)]\n",
"[(1, 34), (23, 24), (0, 17), (8, 11), (23, 34), (9, 15), (9, 11), (3, 9), (12, 34), (19, 33), (13, 24), (10, 29)]\n",
"[(12, 29), (1, 17), (9, 23), (9, 10), (12, 33), (7, 12), (9, 12), (13, 29), (11, 34), (4, 8), (8, 32), (0, 6)]\n",
"[(2, 17), (1, 9), (17, 21), (13, 34), (9, 28), (11, 15), (17, 23), (9, 17), (13, 33), (6, 32), (13, 17), (9, 34)]\n",
"[(4, 17), (4, 12), (1, 13), (24, 32), (2, 9), (11, 17), (14, 28), (12, 23), (8, 17), (9, 24), (3, 11), (13, 28)]\n",
"[(9, 25), (7, 14), (17, 25), (12, 20), (10, 13), (4, 11), (12, 30), (13, 32), (9, 32), (7, 31), (0, 8), (1, 11)]\n",
"[(23, 31), (10, 31), (11, 25), (5, 34), (3, 26), (7, 11), (2, 12), (0, 11), (1, 15), (2, 28)]\n"
]
}
],
"source": [
"def chunks(l, n):\n",
" \"\"\"Yield successive n-sized chunks from l.\"\"\"\n",
" for it in range(0, len(l), n):\n",
" yield l[it:it + n]\n",
" \n",
"e = []\n",
"predict = []\n",
"for i in range(len(data_key)):\n",
" for j in range(i+1, len(data_key)):\n",
" if i in neighbours[j]:\n",
" e.append((i, j))\n",
" else:\n",
" predict.append((i, j))\n",
" \n",
"# e now contains all link pairs\n",
"# predict contains all non-existing links from the original data\n",
"# each pair is a tuple (a, b), where a < b\n",
"\n",
"# We now randomly shuffle this list\n",
"import random\n",
"random.shuffle(e)\n",
"\n",
"print('len(e)', len(e))\n",
"print('len(e)//10 =', len(e)//10)\n",
"\n",
"# Create e_prime, a list of 10 partitions\n",
"e_prime = []\n",
"for _ in range(10):\n",
" e_prime.append(list(chunks(e, len(e)//10 + 1)))\n",
"\n",
"# TODO(iandioch): Figure out why the following line is necessary?\n",
"e_prime = e_prime[0]\n",
"\n",
"# The following is a quick eyeball test to make sure the partitions look ok.\n",
"print('10 subsets:')\n",
"for i in range(len(e_prime)):\n",
" entry = e_prime[i]\n",
" print(entry)\n"
]
},
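{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Evaluate with AUC.\n",
"\n",
"The cell below compares, for each fold, every held-out (missing) link against every nonexistent link. With $n_1$ comparisons won by the missing link, $n_2$ ties, and $n$ comparisons in total, it computes\n",
"\n",
"$$AUC = \\frac{n_1 + 0.5 n_2}{n}$$"
]
},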
{
"cell_type": "code",
"execution_count": 202,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\t\tn1 \tn2 \tn3 \tAUC\n",
"Fold 1 :\t2222 \t1311 \t2191 \t0.502708\n",
"Fold 2 :\t3893 \t749 \t1082 \t0.745545\n",
"Fold 3 :\t2612 \t1193 \t1919 \t0.560535\n",
"Fold 4 :\t3450 \t885 \t1389 \t0.680031\n",
"Fold 5 :\t4171 \t631 \t922 \t0.783805\n",
"Fold 6 :\t3645 \t813 \t1266 \t0.707809\n",
"Fold 7 :\t4413 \t521 \t790 \t0.816474\n",
"Fold 8 :\t3796 \t772 \t1156 \t0.730608\n",
"Fold 9 :\t2887 \t1100 \t1737 \t0.600454\n",
"Fold 10:\t2364 \t940 \t1466 \t0.594130\n",
"Average:\t3345.3\t891.5\t1391.8\t0.672210\n"
]
}
],
"source": [
"aucs = []\n",
"n1s = []\n",
"n2s = []\n",
"n3s = []\n",
"ns = []\n",
"\n",
"# Column headings.\n",
"print('\\t\\tn1 \\tn2 \\tn3 \\tAUC')\n",
"\n",
"# Iterate across the 10 folds.\n",
"for i in range(10):\n",
" test = e_prime[i]\n",
" \n",
" n1 = 0 # missing_link > nonexistant_link\n",
" n2 = 0 # missing_link = nonexistant_link\n",
" n3 = 0 # missing_link < nonexistant_link\n",
" n = 0 # total link comparisons\n",
" for missing_link in test:\n",
" a_score = S_CN[missing_link[0]][missing_link[1]]\n",
" for nonexistant_link in predict:\n",
" b_score = S_CN[nonexistant_link[0]][nonexistant_link[1]]\n",
" if abs(a_score-b_score) < 0.0005:\n",
" n2 += 1\n",
" elif a_score > b_score:\n",
" n1 += 1\n",
" else:\n",
" n3 += 1\n",
" n += 1\n",
" auc = (n1 + 0.5*n2)/(n)\n",
" aucs.append(auc)\n",
" n1s.append(n1)\n",
" n2s.append(n2)\n",
" n3s.append(n3)\n",
" ns.append(n)\n",
" print('Fold {:<2}:\\t{:<5}\\t{:<5}\\t{:<5}\\t{:<.6f}'.format(i+1, n1, n2, n3, auc))\n",
"\n",
"def avg(seq):\n",
" return sum(seq)/len(seq)\n",
"\n",
"print('Average:\\t{:<5}\\t{:<5}\\t{:<5}\\t{:<.6f}'.format(avg(n1s), avg(n2s), avg(n3s), avg(aucs)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
8 changes: 8 additions & 0 deletions research/datasets/README.md
@@ -0,0 +1,8 @@
# Datasets

## World Soccer Data Paris 1998

Part of the 'Pajek datasets' collection. Retrieved from [here](http://vlado.fmf.uni-lj.si/pub/networks/data/sport/football.htm).

The [downloaded file](http://vlado.fmf.uni-lj.si/pub/networks/data/sport/football.net) was split into two files, `football_key.tsv` and `football_pairs.tsv`.
`football_key.tsv` contains the vertices, and `football_pairs.tsv` contains the arcs.
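
Both files keep the Pajek conventions: comment and section-header lines start with `*`, lines in `football_key.tsv` look like `<id> "CODE" <x> <y> <z>`, and lines in `football_pairs.tsv` look like `<id_a> <id_b> <number_of_players>`. A minimal loading sketch (file paths and the shift from 1-based to 0-based IDs mirror the research notebook):

```python
# Minimal sketch for loading the split files.
# Assumes Pajek comment/header lines start with '*'.

def load_key(path="football_key.tsv"):
    """Map 0-based country index -> country code; layout coords are dropped."""
    with open(path) as f:
        rows = [line.split() for line in f if not line.startswith('*')]
    return {int(r[0]) - 1: r[1].strip('"') for r in rows}

def load_pairs(path="football_pairs.tsv"):
    """List of (a, b, players) arcs with 0-based country indices."""
    with open(path) as f:
        rows = [line.split() for line in f if not line.startswith('*')]
    return [(int(r[0]) - 1, int(r[1]) - 1, int(r[2])) for r in rows]
```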
36 changes: 36 additions & 0 deletions research/datasets/football_key.tsv
@@ -0,0 +1,36 @@
*Vertices 35
1 "ARG" 0.3784 0.7257 0.5
2 "AUT" 0.7070 0.3091 0.5
3 "BEL" 0.3112 0.3627 0.5
4 "BGR" 0.4901 0.2231 0.5
5 "BRA" 0.4183 0.8734 0.5
6 "CHE" 0.8277 0.7710 0.5
7 "CHL" 0.0570 0.4660 0.5
8 "CMR" 0.6661 0.4183 0.5
9 "COL" 0.1708 0.7309 0.5
10 "DEU" 0.6178 0.2022 0.5
11 "DNK" 0.7995 0.3587 0.5
12 "ESP" 0.5580 0.5982 0.5
13 "FRA" 0.6993 0.2113 0.5
14 "GBR" 0.8284 0.4099 0.5
15 "GRE" 0.8529 0.5632 0.5
16 "HRV" 0.6570 0.7437 0.5
17 "IRN" 0.6526 0.0804 0.5
18 "ITA" 0.4556 0.6498 0.5
19 "JAM" 0.8974 0.5263 0.5
20 "JPN" 0.7505 0.9353 0.5
21 "KOR" 0.9060 0.8003 0.5
22 "MAR" 0.4052 0.5224 0.5
23 "MEX" 0.5903 0.8579 0.5
24 "NGA" 0.4846 0.5060 0.5
25 "NLD" 0.6541 0.5668 0.5
26 "NOR" 0.7761 0.4672 0.5
27 "PRT" 0.1998 0.4088 0.5
28 "PRY" 0.5335 0.8248 0.5
29 "ROM" 0.6593 0.5079 0.5
30 "SCO" 0.9430 0.4508 0.5
31 "TUN" 0.4882 0.0647 0.5
32 "TUR" 0.5347 0.3531 0.5
33 "USA" 0.3644 0.1830 0.5
34 "YUG" 0.5178 0.7720 0.5
35 "ZAF" 0.7861 0.6645 0.5