diff --git a/+Cluster/CauchyGE.m b/+Cluster/CauchyGE.m
new file mode 100644
index 0000000000000000000000000000000000000000..686fb71f2cd0fcf9c855b974d51600cc2e2181ed
--- /dev/null
+++ b/+Cluster/CauchyGE.m
@@ -0,0 +1,52 @@
+function v_pts = CauchyGE(A, k)
+    s = 3;      % Cauchy kernel scale
+    g = 1.01;   % per-iteration growth factor for L
+    L = 4;      % step-size denominator; gradient step is 1/L
+
+    max_iter = 100;
+
+    v_init = Cluster.LaplacianEigenmaps(A,k);   % warm-start from the Laplacian Eigenmaps embedding
+
+    R = v_init';    % k x n: one embedded point per column
+    J = computeJR(A,R, s);
+
+    n = size(R,2);
+    for i=1:max_iter
+        gJ = computeGradJR(A,R, s);
+        M = R + (1/L)*gJ;   % gradient-ascent step on J
+
+        mt = (eye(n) - (ones(n,n) / n));    % centering matrix
+        [~,~,V] = svd(M*mt, 'econ');        % re-impose centered, orthonormal rows
+
+        Rn = V';
+        Jn = computeJR(A,Rn, s);
+
+%         if ( (i > 1) && (Jn - J) < 0 )
+%             v_pts = R';
+%             return;
+%         end
+
+        L = g*L;
+        R = Rn;
+        J = Jn;
+    end
+
+    v_pts = R';
+end
+
+function J = computeJR(A,R, s)  % J(R) = sum_ij A_ij / (||r_i - r_j||^2 + s^2)
+    D = squareform(pdist(R.'));
+    J = sum(A(:) ./ (D(:).^2 + s^2));   % pdist returns distances, so square them here
+end
+
+function gJ = computeGradJR(A,R, s)     % gradient of J w.r.t. each column of R
+    gJ = zeros(size(R));
+    for i=1:size(R,2)
+        dd = R(:,i) - R;        % differences to all points
+        nsq = sum(dd.^2, 1);    % squared distances
+
+        wD = A(i,:) ./ ((nsq + s^2).^2);
+
+        gJ(:,i) = -2*sum(wD.*dd, 2);
+    end
+end
diff --git a/+Cluster/gap_ge.m b/+Cluster/gap_ge.m
new file mode 100644
index 0000000000000000000000000000000000000000..cfd4af03791207c9c0141bd3315ed014c40f0479
--- /dev/null
+++ b/+Cluster/gap_ge.m
@@ -0,0 +1,62 @@
+function [k,idx] = gap_ge(A, kmax, ge_method)
+    B = 50;     % number of reference datasets
+
+    W = zeros(1,kmax);
+    W_B = zeros(B,kmax);
+
+    idx_k = zeros(size(A,1),kmax);
+
+    gap = zeros(1,kmax);
+    s = zeros(1,kmax);
+    for chk_k=1:kmax
+        [W(chk_k),W_B(:,chk_k), idx_k(:,chk_k)] = cluster_k(A, B,chk_k, ge_method);
+
+        gap(chk_k) = 1/B*sum(log(W_B(:,chk_k))) - log(W(chk_k));   % Gap(k) = E*[log(W_k)] - log(W_k)
+        s(chk_k) = std(W_B(:,chk_k)) * sqrt(1+1/B);
+
+        if ( chk_k > 1 && ( gap(chk_k-1) >= gap(chk_k) - s(chk_k) ))   % smallest k with Gap(k) >= Gap(k+1) - s(k+1)
+            k = chk_k-1;
+            idx = idx_k(:,k);
+            return;
+        end
+    end
+
+    %% Didn't find a satisfactory k
+    k = chk_k;
+    idx = idx_k(:,end);
+end
+
+function [Wk,W_Bk, idx_k] = cluster_k(A, B, k, ge_method)
+    %% Cluster using graph-embedding algorithm/kmeans
+    v_pts = ge_method(A,k);
+    [Wk,idx_k] = eval_kmeans(v_pts,k);
+
+    %% Cheat by creating new uniform reference data per-k (don't bother with the SVD rotation since the points are already in an eigenspace)
+    d_min = min(v_pts,[],1);
+    d_max = max(v_pts,[],1);
+    rnd_data = (d_max-d_min).*rand([size(v_pts), B]) + d_min;
+
+    %% Dispersion of each reference dataset
+    W_Bk = zeros(B,1);
+    for i=1:B
+        W_Bk(i) = eval_kmeans(rnd_data(:,:,i), k);
+    end
+end
+
+function [Wk,idx_k] = eval_kmeans(X,k)
+    num_reps = 15;
+
+    idx_k = ones(size(X,1),1);
+    if ( k > 1 )
+        idx_k = kmeans(X,k, 'emptyaction','singleton', 'replicates',num_reps, 'maxiter',200);
+    end
+
+    %% Just use intracluster dispersion
+    Wk = 0;
+    for i=1:k
+        bK = idx_k==i;      % members of cluster i
+        Xk = X(bK,:);
+
+        Wk = Wk + 0.5*sum(var(Xk));
+    end
+end
diff --git a/reddit_run_partial.m b/reddit_run_partial.m
index 3d2608a92f748f89108b83d597ac6f0d03549a84..ba99957dd713f1554c39f943cbbf3b69bc3a65d6 100644
--- a/reddit_run_partial.m
+++ b/reddit_run_partial.m
@@ -9,14 +9,81 @@ end
 
 load(chkpt_list(end).name);
 
+%% Drop all but the largest connected component of G
+bins = conncomp(G);
+bincounts = arrayfun(@(x)(nnz(bins==x)), 1:max(bins));
+
+
+[~,binidx] = max(bincounts);
+subnodes = find(bins == binidx);
+
+% Keep track of original edge IDs
+G.Edges.OrgIDs = (1:G.numedges).';
+
+subG = subgraph(G, subnodes);
+
 %% Plot the graph to see what it currently looks like
-plot(G);
+plot(subG);
 
-%% Create a weighted adjancency matrix to
+%% Create a weighted adjacency matrix for clustering
+% Normalize edge weights (on subG, since A is built from subG below)
+subG.Edges.Weight = subG.Edges.Weight ./ max(subG.Edges.Weight);
+
+nn = numnodes(subG);
+[s,t] = findedge(subG);
+A = sparse(s,t,subG.Edges.Weight,nn,nn);
 A = max(A,A');
 
-%% Clustering using an arbitrary k (Laplacian Eigenmaps)
-idx = Cluster.kmeans_le(A, 10);
+%% Clustering with k chosen by the gap statistic (Laplacian Eigenmaps embedding)
+[k,idx] = Cluster.gap_ge(A, 20, @Cluster.LaplacianEigenmaps);
+
+%% Get subreddits/titles per-group
+[s,t] = findedge(subG);
+E = sparse(s,t,subG.Edges.OrgIDs,nn,nn);    % adjacency-style lookup of original edge IDs
+E = max(E,E');
+
+subreddits = cell(k,1);
+titles = cell(k,1);
+for i=1:k
+    bCluster = (idx == i);
+    cE = E(bCluster,:);
+    cE = cE(:,bCluster);
+
+    cEdgeIDs = cE(cE>0);
+    linkIdxs = find(any(S(cEdgeIDs,:),1));      % S, links, link_map come from the loaded checkpoint
+    subsList = {links(linkIdxs).subreddit}.';
+
+    subreddits{i} = unique(subsList);
+    titles{i} = {};
+    for j=1:length(linkIdxs)
+        linkID = links(linkIdxs(j)).id;
+        titles{i} = [titles{i}; {link_map(linkID)}];
+    end
+end
+
+%% Get listings of common and unique subreddits per cluster
+common_subs = cell(k,k);
+unique_subs = cell(k,1);
+for i=1:k
+    bCom = false(k,length(subreddits{i}));
+    for j=1:k
+        if (j==i)
+            continue;
+        end
+
+        bCom(j,:) = ismember(subreddits{i},subreddits{j}).';
+        common_subs{i,j} = subreddits{i}(bCom(j,:));
+    end
+    bUnique = all(bCom == 0, 1);
+    unique_subs{i} = subreddits{i}(bUnique);
+end
+
+
+%% Draw graph with nodes colored by cluster index
+cmap = lines(k);
+H = plot(subG);
+for i=1:k
+    nodeIDs = find(idx==i);
+    highlight(H, nodeIDs, 'NodeColor',cmap(i,:));
+end
+
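Note: Cluster/CauchyGE.m is added above but nothing calls it yet; reddit_run_partial.m still embeds with Cluster.LaplacianEigenmaps. Because gap_ge takes the embedding as a function handle, swapping in the Cauchy embedding is a one-line change. A minimal sketch (hypothetical usage, not part of this diff; assumes A has been built from subG as above):

    [k, idx] = Cluster.gap_ge(A, 20, @Cluster.CauchyGE);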