Skip to content
Snippets Groups Projects
Commit e569883b authored by Mark Winter's avatar Mark Winter
Browse files

Run partial checkpointed data with Laplacian Eigenmaps

parent 6f1aa502
Branches
No related tags found
No related merge requests found
*.zip
*.mat
function G = reddit_graph(tokenObj)
function [G,S,links,link_map] = reddit_graph(tokenObj)
%% Get a 1-hr token
if ( ~exist('tokenObj','var') || isempty(tokenObj) )
credStruct = load_reddit_creds('reddit_creds.json');
......@@ -8,32 +8,45 @@ function G = reddit_graph(tokenObj)
% A few constants
graph_params = struct('max_depth',{3}, 'link_limit',{10}, 'comment_limit',{10}, 'ignore_users',{{'[deleted]';'AutoModerator'}});
%% Load first 100 /r/<subreddit>/hot entries and start building user graph
%% Load first link_limit /r/<subreddit>/hot entries and start building user graph
subreddit = 'math';
req = sprintf('r/%s/hot?limit=%d', subreddit, graph_params.link_limit);
r = reddit_query(req, tokenObj);
%%
%% Major structures for checkpointing and graph-linkup
% This will be the user relationship graph
G = graph();
% chk_links = struct('id',{}, 'subreddit',{}, 'user',{}, 'depth',{});
% Sparse bipartite map from Reddit posts (links) to user-graph edgeIDs
S = sparse([]);
% Also, keep search-ordered listing of all Reddit links
links = struct('id',{}, 'subreddit',{}, 'user',{}, 'depth',{});
% Map Reddit post (linkIDs) to post titles
link_map = containers.Map('KeyType','char', 'ValueType','char');
%%
%% Seed the link listing with the popular entries
popular_links = r.data.children;
chk_links = create_new_links(popular_links, 0, link_map, graph_params);
seed_links = create_new_links(popular_links, 0, link_map, graph_params);
[links,S] = add_links(seed_links, links,S);
% Run through all listings breadth-first
while ( ~isempty(chk_links) )
link = chk_links(1);
[new_links, G] = process_link(link, G, link_map, graph_params, tokenObj);
link_idx = 1;
while ( link_idx <= length(links) )
[new_links,G,S] = process_link(links,link_idx, G,S,link_map, graph_params, tokenObj);
[links,S] = add_links(new_links, links,S);
link_idx = link_idx + 1;
chk_links = [chk_links(2:end); new_links];
reddit_search_checkpoint(G,S,links,link_map, link_idx);
end
end
function [new_links, G] = process_link(link, G, link_map, graph_params, tokenObj)
%% Process a Reddit post (find all users associated with post and find links associated with those users)
function [new_links,G,S] = process_link(links,link_idx, G,S,link_map, graph_params, tokenObj)
new_links = [];
link = links(link_idx);
if ( link.depth > graph_params.max_depth )
return;
......@@ -46,7 +59,7 @@ function [new_links, G] = process_link(link, G, link_map, graph_params, tokenObj
link_users = link_users(~bIgnoreUsers);
[new_users, G] = update_graph(link_users, G);
[new_users,G,S] = update_graph(link_users,link_idx, G,S);
%% Process users to find new links
% Don't bother adding new links if we're at max_depth
......@@ -57,6 +70,28 @@ function [new_links, G] = process_link(link, G, link_map, graph_params, tokenObj
new_links = add_user_links(new_users, link.depth+1, link_map, graph_params, tokenObj);
end
%% Checkpoint this search to a file for partial results checks and for restarting
% Saves the user-graph (G), the bipartite link/edge map (S), the
% search-ordered link listing, the linkID->title map, and the current
% search index (link_idx) to a timestamped .mat file. Only the most
% recent max_keep_chkpt checkpoints are kept; older ones are deleted.
function reddit_search_checkpoint(G,S,links,link_map,link_idx)
	save_struct = struct('G',{G},...
		'S',{S},...
		'links',{links},...
		'link_map',{link_map},...
		'link_idx',{link_idx});

	max_keep_chkpt = 3;
	chkpoint_prefix = 'reddit_chkpoint';

	% Timestamped filename so successive checkpoints never collide
	now_string = datestr(now, 'yyyy-mm-dd_HH-MM-SS');
	filename = sprintf('%s_%s.mat', chkpoint_prefix, now_string);
	save(filename, '-struct','save_struct');

	% Prune oldest checkpoints beyond the keep limit. Sort explicitly by
	% file modification time rather than relying on dir()'s listing order,
	% which is platform/filesystem dependent.
	chkpt_list = dir([chkpoint_prefix '_*.mat']);
	if ( length(chkpt_list) > max_keep_chkpt )
		[~,srt_idx] = sort([chkpt_list.datenum]);
		chkpt_list = chkpt_list(srt_idx);
		rm_list = {chkpt_list(1:(end-max_keep_chkpt)).name};
		delete(rm_list{:});
	end
end
end
%% Find all users who posted or commented on current post (linkID)
function link_users = find_link_users(link, tokenObj)
% Always start with the original post user
link_users = {link.user};
......@@ -78,6 +113,7 @@ function link_users = find_link_users(link, tokenObj)
link_users = unique([link_users; comment_users], 'stable');
end
%% Add new posts (linkIDs) from new users discovered in current post being processed
function new_links = add_user_links(new_users, next_depth, link_map, graph_params, tokenObj)
new_links = [];
for i=1:length(new_users)
......@@ -86,6 +122,7 @@ function new_links = add_user_links(new_users, next_depth, link_map, graph_param
end
end
%% Get a limited number of posts and comments a user has created
function new_links = query_user_links(user_id, next_depth, link_map, graph_params, tokenObj)
new_links = [];
......@@ -127,6 +164,17 @@ function new_links = query_user_links(user_id, next_depth, link_map, graph_param
new_links = create_new_links(posts_list, next_depth, link_map, graph_params);
end
%% Add new links to main link exploration list and make new bipartite map nodes
% NOTE: We assume the links have already been verified to be new unique
% entries, see create_new_links() for that functionality
%
% new_links - struct array of unexplored link entries (may be empty)
% links     - search-ordered listing of all links found so far
% S         - sparse bipartite (edge x link) map; one column per link
function [links,S] = add_links(new_links, links,S)
	% Append in search order so column indices of S track positions in links
	links = [links; new_links];

	% Grow S with an all-zero column per new link. Use sparse() rather than
	% zeros() so no dense block is ever allocated when padding the map.
	num_edges = size(S,1);
	S = [S, sparse(num_edges,length(new_links))];
end
%% Create link structures for unexplored new links
function new_links = create_new_links(link_listing, depth, link_map, graph_params)
new_links = [];
......@@ -166,7 +214,8 @@ function new_links = create_new_links(link_listing, depth, link_map, graph_param
end
end
function [new_users, G] = update_graph(link_users, G)
%% Add users and edges to user-graph for a particular reddit post
function [new_users,G,S] = update_graph(link_users,link_idx, G,S)
nodeIDs = zeros(length(link_users),1);
if ( G.numnodes > 0 )
nodeIDs = findnode(G, link_users);
......@@ -201,7 +250,15 @@ function [new_users, G] = update_graph(link_users, G)
% Add new edges
newS = s(~bValidEdges);
newT = t(~bValidEdges);
newEdgeIDs = [];
if ( ~isempty(newS) )
G = addedge(G, newS,newT, 1);
newEdgeIDs = findedge(G, newS,newT);
end
% Create mapping between linkID and edgeIDs
num_links = size(S,2);
all_edges = [edgeIDs;newEdgeIDs];
S = [S; zeros(length(newEdgeIDs),num_links)];
S(all_edges,link_idx) = 1;
end
%% Load most recent checkpoint
% Checkpoint files are named reddit_chkpoint_<yyyy-mm-dd_HH-MM-SS>.mat, so
% lexicographic listing order is chronological and the last entry is newest.
ckpt_prefix = 'reddit_chkpoint';
chkpt_list = dir([ckpt_prefix '_*.mat']);
if ( isempty(chkpt_list) )
warning('No checkpoints found in current directory!');
return;
end
% Loads the checkpointed variables (G, S, links, link_map, link_idx)
% directly into the workspace
load(chkpt_list(end).name);
%% Plot the graph to see what it currently looks like
plot(G);
%% Create a weighted adjacency matrix from the user-graph edge weights
nn = numnodes(G);
[s,t] = findedge(G);
A = sparse(s,t,G.Edges.Weight,nn,nn);
% Symmetrize: findedge gives each undirected edge once, so mirror it with
% the elementwise max of A and its transpose
A = max(A,A');
%% Clustering using an arbitrary k (Laplacian Eigenmaps)
% NOTE(review): Cluster.kmeans_le is a project-local helper — presumably a
% Laplacian-Eigenmaps embedding followed by k-means with k=10; verify
% against its source before relying on the cluster semantics.
idx = Cluster.kmeans_le(A, 10);
......@@ -3,7 +3,7 @@ credStruct = load_reddit_creds('reddit_creds.json');
tokenObj = reddit_auth(credStruct);
%% Generate a user-graph from Reddit
G = reddit_graph(tokenObj);
[G,S,links,link_map] = reddit_graph(tokenObj);
%% Create a weighted adjancency matrix to
nn = numnodes(G);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment