Attempt at Cauchy graph embedding and gap statistic

dcf12420 · Mark Winter · f36d5d39 · dcf12420 · dcf12420 · dcf12420
Commit dcf12420 authored Jun 1, 2020 by Mark Winter
--- a/+Cluster/CauchyGE.m
+++ b/+Cluster/CauchyGE.m
+function v_pts = CauchyGE(A, k)
+    % CAUCHYGE  Cauchy graph embedding refined from a Laplacian Eigenmaps start.
+    %
+    %   v_pts = CauchyGE(A, k) takes the embedding returned by
+    %   Cluster.LaplacianEigenmaps(A,k) and applies max_iter rounds of:
+    %   a gradient step on the objective, mean removal across points, and
+    %   replacement of the coordinates by the right singular vectors of the
+    %   centered result.
+    %
+    %   A     : pairwise weight matrix; row i is read as the weights of point i.
+    %   k     : forwarded unchanged to Cluster.LaplacianEigenmaps.
+    %   v_pts : final coordinates, transposed back to the orientation of the
+    %           initial embedding (assumed one row per node - TODO confirm
+    %           against Cluster.LaplacianEigenmaps).
+    s = 3;          % scale of the Cauchy kernel; enters the objective as s^2
+    g = 1.01;       % growth factor applied to L after every iteration
+    L = 4;          % the gradient step length is 1/L, so steps shrink as L grows
+    max_iter = 100;
+    v_init = Cluster.LaplacianEigenmaps(A,k);
+    R = v_init';    % work with one point per column
+    J = computeJR(A,R, s);  % objective value; only consumed by the disabled early exit below
+    for i=1:max_iter
+        gJ = computeGradJR(A,R, s);
+        M = R + (1/L)*gJ;
+        % Subtract the mean over points from every coordinate row. This equals
+        % M*(eye(n) - ones(n,n)/n) without building a dense n-by-n matrix on
+        % every iteration.
+        Mc = M - mean(M, 2);
+        [~,~,V] = svd(Mc, 'econ');
+        Rn = V';
+        Jn = computeJR(A,Rn, s);
+%         if ( (i > 1) && (Jn - J) < 0 )
+%             v_pts = R';
+%             return;
+%         end
+        L = g*L;
+        R = Rn;
+        J = Jn;
+    end
+    v_pts = R';
+end
+function J = computeJR(A,R, s)
+    % COMPUTEJR  Objective value: sum over all pairs of A(i,j) / (d2(i,j) + s^2).
+    %
+    %   A : pairwise weights; A(i,j) is paired with columns i and j of R.
+    %   R : coordinates, one point per column.
+    %   s : kernel scale, added as s^2.
+    %
+    %   d2 is the SQUARED Euclidean distance between points. computeGradJR
+    %   differentiates 1/(||ri-rj||^2 + s^2), so the objective has to use the
+    %   squared distance as well for the two to describe the same function.
+    D2 = squareform(pdist(R.', 'squaredeuclidean'));
+    J = sum(A(:) ./ (D2(:) + s^2));
+end
+function gJ = computeGradJR(A,R, s)
+    % COMPUTEGRADJR  Per-point gradient used for the embedding update.
+    %
+    %   A  : pairwise weights; row p holds the weights attached to point p.
+    %   R  : coordinates, one point per column.
+    %   s  : kernel scale, used as s^2.
+    %   gJ : same size as R; column p is
+    %        -2 * sum_j A(p,j) * (R(:,p) - R(:,j)) / (||R(:,p) - R(:,j)||^2 + s^2)^2.
+    num_pts = size(R,2);
+    gJ = zeros(size(R));
+    for p = 1:num_pts
+        % Offsets from every point to point p, one column per point.
+        offsets = bsxfun(@minus, R(:,p), R);
+        sq_len = sum(offsets.^2, 1);
+        % Weight of each neighbour divided by the squared kernel denominator.
+        pair_w = A(p,:) ./ ((sq_len + s^2).^2);
+        weighted = pair_w.*offsets;
+        gJ(:,p) = -2*sum(weighted, 2);
+    end
+end
--- a/+Cluster/gap_ge.m
+++ b/+Cluster/gap_ge.m
+function [k,idx] = gap_ge(A, kmax, ge_method)
+    % GAP_GE  Choose a cluster count with the gap statistic on a graph embedding.
+    %
+    %   [k,idx] = gap_ge(A, kmax, ge_method) evaluates k = 1..kmax in order
+    %   and stops at the first k for which gap(k) >= gap(k+1) - s(k+1).
+    %
+    %   A         : matrix handed to ge_method.
+    %   kmax      : largest cluster count to try.
+    %   ge_method : function handle called as ge_method(A,k); its result is
+    %               clustered by cluster_k.
+    %   k         : selected cluster count (kmax if the stopping rule never fires).
+    %   idx       : cluster label per row of A for the selected k.
+    B = 50;                             % reference data sets drawn for every k
+    W = zeros(1,kmax);                  % dispersion of the real data per k
+    W_B = zeros(B,kmax);                % dispersion of each reference set per k
+    idx_k = zeros(size(A,1),kmax);      % labels per k, one column each
+    gap = zeros(1,kmax);
+    s = zeros(1,kmax);
+    for chk_k=1:kmax
+        [W(chk_k),W_B(:,chk_k), idx_k(:,chk_k)] = cluster_k(A, B,chk_k, ge_method);
+        logW_B = log(W_B(:,chk_k));
+        gap(chk_k) = 1/B*sum(logW_B) - log(W(chk_k));
+        % The spread must be measured on the same log scale as the gap,
+        % normalised by B (not B-1) as in the gap-statistic definition.
+        s(chk_k) = std(logW_B, 1) * sqrt(1+1/B);
+        if ( chk_k > 1 && ( gap(chk_k-1) >= gap(chk_k) - s(chk_k) ))
+            k = chk_k-1;
+            idx = idx_k(:,k);
+            return;
+        end
+    end
+    %% Didn't find a satisfactory k
+    k = chk_k;
+    idx = idx_k(:,end);
+end
+function [Wk,W_Bk, idx_k] = cluster_k(A, B, k, ge_method)
+    % CLUSTER_K  Dispersion of the embedded data and of B uniform reference sets.
+    %
+    %   Wk    : dispersion returned by eval_kmeans for the embedding ge_method(A,k).
+    %   W_Bk  : B-by-1 dispersions of the reference sets, clustered with the same k.
+    %   idx_k : cluster labels of the embedded points.
+
+    % Embed the graph, then cluster the embedded points.
+    embedded = ge_method(A,k);
+    [Wk,idx_k] = eval_kmeans(embedded,k);
+
+    % Shortcut: draw fresh uniform reference data for every k inside the
+    % per-column bounding box of the embedding (no svd alignment, since the
+    % points already live in the eigenspace).
+    lo = min(embedded,[],1);
+    hi = max(embedded,[],1);
+    box_width = hi-lo;
+    ref_sets = box_width.*rand([size(embedded), B]) + lo;
+
+    % Cluster each reference set and record its dispersion.
+    W_Bk = zeros(B,1);
+    for b = 1:B
+        ref_pts = ref_sets(:,:,b);
+        W_Bk(b) = eval_kmeans(ref_pts, k);
+    end
+end
+function [Wk,idx_k] = eval_kmeans(X,k)
+    % EVAL_KMEANS  Cluster the rows of X into k groups and measure dispersion.
+    %
+    %   X     : data, one observation per row.
+    %   k     : number of clusters; for k <= 1 every row gets label 1 and
+    %           kmeans is not called.
+    %   Wk    : sum over clusters of 0.5 * (sum of per-column variances).
+    %   idx_k : cluster label for each row of X.
+    num_reps = 15;
+    idx_k = ones(size(X,1),1);
+    if ( k > 1 )
+        idx_k = kmeans(X,k, 'emptyaction','singleton', 'replicates',num_reps, 'maxiter',200);
+    end
+    %% Just use intracluster dispersion
+    Wk = 0;
+    for i=1:k
+        % Select cluster i (not k: that would add the last cluster k times).
+        bK = idx_k==i;
+        Xk = X(bK,:);
+        % Fix the dimension: a one-row cluster must give zero variance per
+        % column instead of a variance taken across its features.
+        Wk = Wk + 0.5*sum(var(Xk, 0, 1));
+    end
+end
--- a/reddit_run_partial.m
+++ b/reddit_run_partial.m
@@ -9,14 +9,81 @@ end
 load(chkpt_list(end).name);
+%% Drop all but the largest connected component of G
+bins = conncomp(G); % component label for every node of G (G is not defined in the visible part; presumably loaded above - TODO confirm)
+bincounts = arrayfun(@(x)(nnz(bins==x)) ,1:max(bins)); % number of nodes carrying each label
+[~,binidx] = max(bincounts); % label with the most nodes
+subnodes = find(bins == binidx); % node indices carrying that label
+% Keep track of original edgeIDs
+G.Edges.OrgIDs = (1:G.numedges).'; % number the edges of G before extracting the subgraph
+subG = subgraph(G, subnodes);
 %% Plot the graph to see what it currently looks like
-plot(G);
+plot(subG);
 %% Create a weighted adjancency matrix to 
-nn = numnodes(G);
+% Normalize edge weights of the retained component (largest weight becomes 1)
-[s,t] = findedge(G);
+subG.Edges.Weight = subG.Edges.Weight ./ max(subG.Edges.Weight); % scale subG itself: it was extracted earlier, so rescaling G here would never reach A
-A = sparse(s,t,G.Edges.Weight,nn,nn);
+nn = numnodes(subG);
+[s,t] = findedge(subG); % endpoint node indices of every edge in subG
+A = sparse(s,t,subG.Edges.Weight,nn,nn); % nn-by-nn weights, filled at (s,t) only
 A = max(A,A');
 %% Clustering using an arbitrary k (Laplacian Eigenmaps)
-idx = Cluster.kmeans_le(A, 10);
+[k,idx] = Cluster.gap_ge(A, 20, @Cluster.LaplacianEigenmaps); % k is now chosen by gap_ge, which tries cluster counts up to 20
+%% Get subreddits/titles per-group
+[s,t] = findedge(subG);
+E = sparse(s,t,subG.Edges.OrgIDs,nn,nn); % E(s,t) stores the OrgIDs value of the edge between s and t
+E = max(E,E'); % mirror so the ID is found under either endpoint order
+subreddits = cell(k,1);
+titles = cell(k,1);
+for i=1:k
+    bCluster = (idx == i); % nodes assigned to cluster i
+    cE = E(bCluster,:);
+    cE = cE(:,bCluster); % keep only entries whose two endpoints are both in cluster i
+    cEdgeIDs = cE(cE>0); % OrgIDs of those edges; the mirrored E lists an edge under (s,t) and (t,s)
+    linkIdxs = find(any(S(cEdgeIDs,:),1)); % columns of S with a nonzero in any selected row; S is not defined in the visible part, presumably an edge-by-link matrix from the checkpoint - TODO confirm
+    subsList = {links(linkIdxs).subreddit}.';
+    subreddits{i} = unique(subsList); % distinct subreddit values among the selected links
+    titles{i} = {};
+    for j=1:length(linkIdxs)
+        linkID = links(linkIdxs(j)).id;
+        titles{i} = [titles{i}; {link_map(linkID)}]; % presumably link_map maps a link id to its title; verify against the checkpoint contents
+    end
+end
+%% Get listings of common and unique subreddits per cluster
+common_subs = cell(k,k); % common_subs{i,j}: entries of subreddits{i} that also occur in subreddits{j}
+unique_subs = cell(k,1); % unique_subs{i}: entries of subreddits{i} found in no other cluster
+for i=1:k
+    bCom = false(k,length(subreddits{i})); % row j flags which entries of subreddits{i} appear in subreddits{j}
+    for j=1:k
+        if (j==i)
+            continue; % row i stays all-false, so a cluster never counts as sharing with itself
+        end
+        bCom(j,:) = ismember(subreddits{i},subreddits{j}).';
+        common_subs{i,j} = subreddits{i}(bCom(j,:));
+    end
+    bUnique = all(bCom == 0, 1); % true where no other cluster contains the entry
+    unique_subs{i} = subreddits{i}(bUnique);
+end
+%% Draw graph with nodes colored by cluster index
+cmap = lines(k); % one color row per cluster
+H = plot(subG);
+for i=1:k
+    nodeIDs = find(idx==i); % nodes assigned to cluster i
+    highlight(H, nodeIDs, 'NodeColor',cmap(i,:)); % recolor those nodes with row i of cmap
+end