Skip to content
Snippets Groups Projects
Commit dcf12420 authored by Mark Winter's avatar Mark Winter
Browse files

Attempt at Cauchy graph embedding and cap statistic

parent f36d5d39
Branches master
No related tags found
No related merge requests found
function v_pts = CauchyGE(A, k)
s = 3;
g = 1.01;
L = 4;
max_iter = 100;
v_init = Cluster.LaplacianEigenmaps(A,k);
R = v_init';
J = computeJR(A,R, s);
n = size(R,2);
for i=1:max_iter
gJ = computeGradJR(A,R, s);
M = R + (1/L)*gJ;
mt = (eye(n) - (ones(n,n) / n));
[~,~,V] = svd(M*mt, 'econ');
Rn = V';
Jn = computeJR(A,Rn, s);
% if ( (i > 1) && (Jn - J) < 0 )
% v_pts = R';
% return;
% end
L = g*L;
R = Rn;
J = Jn;
end
v_pts = R';
end
function J = computeJR(A,R, s)
D = squareform(pdist(R.'));
J = sum(A(:) ./ (D(:) + s^2));
end
function gJ = computeGradJR(A,R, s)
gJ = zeros(size(R));
for i=1:size(R,2)
dd = R(:,i) - R;
nsq = sum(dd.^2, 1);
wD = A(i,:) ./ ((nsq + s^2).^2);
gJ(:,i) = -2*sum(wD.*dd, 2);
end
end
function [k,idx] = gap_ge(A, kmax, ge_method)
B = 50;
W = zeros(1,kmax);
W_B = zeros(B,kmax);
idx_k = zeros(size(A,1),kmax);
gap = zeros(1,kmax);
s = zeros(1,kmax);
for chk_k=1:kmax
[W(chk_k),W_B(:,chk_k), idx_k(:,chk_k)] = cluster_k(A, B,chk_k, ge_method);
gap(chk_k) = 1/B*sum(log(W_B(:,chk_k))) - log(W(chk_k));
s(chk_k) = std(W_B(:,chk_k)) * sqrt(1+1/B);
if ( chk_k > 1 && ( gap(chk_k-1) >= gap(chk_k) - s(chk_k) ))
k = chk_k-1;
idx = idx_k(:,k);
return;
end
end
%% Didn't find a satisfactory k
k = chk_k;
idx = idx_k(:,end);
end
function [Wk,W_Bk, idx_k] = cluster_k(A, B, k, ge_method)
%% Cluster using graph-embedding algorithm/kmeans
v_pts = ge_method(A,k);
[Wk,idx_k] = eval_kmeans(v_pts,k);
%% Cheat by creating new random data per-k (don't bother with svd since already in eignespace)
d_min = min(v_pts,[],1);
d_max = max(v_pts,[],1);
rnd_data = (d_max-d_min).*rand([size(v_pts), B]) + d_min;
%%
W_Bk = zeros(B,1);
for i=1:B
W_Bk(i) = eval_kmeans(rnd_data(:,:,i), k);
end
end
function [Wk,idx_k] = eval_kmeans(X,k)
num_reps = 15;
idx_k = ones(size(X,1),1);
if ( k > 1 )
idx_k = kmeans(X,k, 'emptyaction','singleton', 'replicates',num_reps, 'maxiter',200);
end
%% Just use intracluster dispersion
Wk = 0;
for i=1:k
bK = idx_k==k;
Xk = X(bK,:);
Wk = Wk + 0.5*sum(var(Xk));
end
end
...@@ -9,14 +9,81 @@ end ...@@ -9,14 +9,81 @@ end
load(chkpt_list(end).name); load(chkpt_list(end).name);
%% Drop all but the largest connected component of G
bins = conncomp(G);
bincounts = arrayfun(@(x)(nnz(bins==x)) ,1:max(bins));
[~,binidx] = max(bincounts);
subnodes = find(bins == binidx);
% Keep track of original edgeIDs
G.Edges.OrgIDs = (1:G.numedges).';
subG = subgraph(G, subnodes);
%% Plot the graph to see what it currently looks like %% Plot the graph to see what it currently looks like
plot(G); plot(subG);
%% Create a weighted adjancency matrix to %% Create a weighted adjancency matrix to
nn = numnodes(G); % Normalize edge weights
[s,t] = findedge(G); G.Edges.Weight = G.Edges.Weight ./ max(G.Edges.Weight);
A = sparse(s,t,G.Edges.Weight,nn,nn);
nn = numnodes(subG);
[s,t] = findedge(subG);
A = sparse(s,t,subG.Edges.Weight,nn,nn);
A = max(A,A'); A = max(A,A');
%% Clustering using an arbitrary k (Laplacian Eigenmaps) %% Clustering using an arbitrary k (Laplacian Eigenmaps)
idx = Cluster.kmeans_le(A, 10); [k,idx] = Cluster.gap_ge(A, 20, @Cluster.LaplacianEigenmaps);
%% Get subreddits/titles per-group
[s,t] = findedge(subG);
E = sparse(s,t,subG.Edges.OrgIDs,nn,nn);
E = max(E,E');
subreddits = cell(k,1);
titles = cell(k,1);
for i=1:k
bCluster = (idx == i);
cE = E(bCluster,:);
cE = cE(:,bCluster);
cEdgeIDs = cE(cE>0);
linkIdxs = find(any(S(cEdgeIDs,:),1));
subsList = {links(linkIdxs).subreddit}.';
subreddits{i} = unique(subsList);
titles{i} = {};
for j=1:length(linkIdxs)
linkID = links(linkIdxs(j)).id;
titles{i} = [titles{i}; {link_map(linkID)}];
end
end
%% Get listings of common and unique subreddits per cluster
common_subs = cell(k,k);
unique_subs = cell(k,1);
for i=1:k
bCom = false(k,length(subreddits{i}));
for j=1:k
if (j==i)
continue;
end
bCom(j,:) = ismember(subreddits{i},subreddits{j}).';
common_subs{i,j} = subreddits{i}(bCom(j,:));
end
bUnique = all(bCom == 0, 1);
unique_subs{i} = subreddits{i}(bUnique);
end
%% Draw graph with nodes colored by cluster index
cmap = lines(k);
H = plot(subG);
for i=1:k
nodeIDs = find(idx==i);
highlight(H, nodeIDs, 'NodeColor',cmap(i,:));
end
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment