Commit 07c92ecf authored by Mark Winter

Initial reddit testing code

parent c0e84098
% Embed using the k smallest nontrivial eigenvectors of the symmetric
% normalized Laplacian of affinity matrix A
function v_pts = LaplacianEigenmaps(A,k)
    % Degree matrix: D(i,i) is the total affinity of node i
    D = zeros(size(A));
    for i=1:size(A,1)
        D(i,i) = sum(A(i,:));
    end
    L = D - A;
    % Symmetric normalization: L = D^(-1/2) (D - A) D^(-1/2)
    L = D^(-.5) * L * D^(-.5);
    % Regularize: symmetrize L and zero its diagonal
    for i=1:size(L,1)
        for j=1:size(L,2)
            L(i,j) = max(L(i,j),L(j,i));
        end
        L(i,i) = 0;
    end
    [eVec,eVal] = eig(L);
    evals = diag(eVal);
    [~,srtIdx] = sort(evals);
    % Skip the trivial first eigenvector, keep the next k
    v_pts = eVec(:,srtIdx(2:k+1));
end
% Cluster affinity matrix A into k groups: spectral embedding + k-means
function idx = kmeans_le(A,k)
    num_reps = 15;
    X = Cluster.LaplacianEigenmaps(A,k);
    idx = kmeans(X,k, 'emptyaction','singleton', 'replicates',num_reps);
end
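% Usage sketch (not part of this commit): cluster a toy two-blob affinity
% matrix. Assumes these functions live in a +Cluster package (or a Cluster
% classdef with static methods), matching the Cluster.* calls above.
pts = [randn(20,2); randn(20,2) + 6];           % two well-separated blobs
A_toy = exp(-squareform(pdist(pts)).^2 / 2);    % Gaussian affinity matrix
A_toy(1:size(A_toy,1)+1:end) = 0;               % zero self-affinity
idx_toy = Cluster.kmeans_le(A_toy, 2);          % should recover the blobs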
% Handle class wrapping a reddit OAuth2 client-credentials token
classdef RedditToken < handle
    properties
        ClientID
        ClientSecret
        AccessToken
        TokenType
        UserAgent
        RequestsLeft
        RequestResetTime
        ExpireTime
    end

    methods
        function obj = RedditToken(creds, agent)
            obj.ClientID = creds.client_id;
            obj.ClientSecret = creds.client_secret;
            obj.UserAgent = agent;
            getAuthToken(obj);
        end

        function getAuthToken(obj)
            % HTTP basic auth with the app's client id/secret
            tokenOptions = weboptions('Username',obj.ClientID, 'Password',obj.ClientSecret);
            token = webwrite('https://www.reddit.com/api/v1/access_token', 'grant_type=client_credentials', tokenOptions);
            % Set access token info
            obj.AccessToken = token.access_token;
            obj.TokenType = token.token_type;
            obj.ExpireTime = datetime('now') + seconds(token.expires_in);
            % Default to a request per second
            obj.RequestsLeft = 1;
            obj.RequestResetTime = datetime('now');
        end
    end
end
function tokenObj = reddit_auth(credStruct)
    agentStr = make_useragent(credStruct);
    tokenObj = RedditToken(credStruct, agentStr);
    % NOTE: MATLAB's webread/webwrite don't expose response headers, so the
    % token's request fields just rate-limit requests to ~1 per second
end
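% Sketch of load_reddit_creds (referenced below but not included in this
% commit): assumes reddit_creds.json holds the fields the rest of the code
% reads (client_id, client_secret, username, app_name, app_ver).
function credStruct = load_reddit_creds(credFile)
    raw = fileread(credFile);        % slurp the JSON file
    credStruct = jsondecode(raw);    % struct with one field per JSON key
end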
function agentStr = make_useragent(credStruct)
    % Set the user-agent properly or there's a high chance of being blocked
    % (see https://github.com/reddit-archive/reddit/wiki/API):
    % <platform>:<app ID>:<version string> (by /u/<reddit username>)
    if ( ispc() )
        archStr = 'Windows';
    elseif ( ismac() )
        archStr = 'Macintosh';
    else
        archStr = 'Linux';
    end
    agentStr = sprintf('%s:%s:v%s (by /u/%s)', archStr, credStruct.app_name, credStruct.app_ver, credStruct.username);
end
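% Example (hypothetical values): on Windows, a credStruct with app_name
% 'redditgraph', app_ver '0.1', and username 'someuser' produces
%   'Windows:redditgraph:v0.1 (by /u/someuser)'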
function G = reddit_graph(tokenObj)
    %% Get a 1-hr token if one wasn't passed in
    if ( ~exist('tokenObj','var') || isempty(tokenObj) )
        credStruct = load_reddit_creds('reddit_creds.json');
        tokenObj = reddit_auth(credStruct);
    end

    % A few constants
    graph_params = struct('max_depth',{3}, 'link_limit',{10}, 'comment_limit',{10}, 'ignore_users',{{'[deleted]';'AutoModerator'}});

    %% Load the first link_limit /r/<subreddit>/hot entries and start building the user graph
    subreddit = 'math';
    req = sprintf('r/%s/hot?limit=%d', subreddit, graph_params.link_limit);
    r = reddit_query(req, tokenObj);

    %%
    G = graph();
    % chk_links = struct('id',{}, 'subreddit',{}, 'user',{}, 'depth',{});
    link_map = containers.Map('KeyType','char', 'ValueType','char');

    %%
    popular_links = r.data.children;
    chk_links = create_new_links(popular_links, 0, link_map, graph_params);
    % Run through all listings breadth-first: pop the head, push new links
    while ( ~isempty(chk_links) )
        link = chk_links(1);
        [new_links, G] = process_link(link, G, link_map, graph_params, tokenObj);
        chk_links = [chk_links(2:end); new_links];
    end
end
function [new_links, G] = process_link(link, G, link_map, graph_params, tokenObj)
    new_links = [];
    if ( link.depth > graph_params.max_depth )
        return;
    end

    %% Process links to create user nodes/weight edges
    % Get all users that have commented on the link as well as the original author
    link_users = find_link_users(link, tokenObj);
    bIgnoreUsers = ismember(link_users, graph_params.ignore_users);
    link_users = link_users(~bIgnoreUsers);
    [new_users, G] = update_graph(link_users, G);

    %% Process users to find new links
    % Don't bother adding new links if we're at max_depth
    if ( link.depth == graph_params.max_depth )
        return;
    end
    new_links = add_user_links(new_users, link.depth+1, link_map, graph_params, tokenObj);
end
function link_users = find_link_users(link, tokenObj)
    % Always start with the original post user
    link_users = {link.user};
    % Drop the type-prefix (e.g. 't3_') from the link fullname to get the link id
    link_id = link.id(4:end);
    % Get the listing of comments on the link; the endpoint returns two
    % listings (the link itself, then its comments), hence r(2)
    req = sprintf('/r/%s/comments/%s?threaded=false', link.subreddit, link_id);
    r = reddit_query(req, tokenObj);
    comment_list = r(2).data.children;
    % Keep only comments that still have an author field
    bValid = arrayfun(@(x)(isfield(x.data,'author')), comment_list);
    comment_list = comment_list(bValid);
    comment_users = arrayfun(@(x)(x.data.author), comment_list, 'UniformOutput',false);
    comment_users = unique(comment_users, 'stable');
    link_users = unique([link_users; comment_users], 'stable');
end
function new_links = add_user_links(new_users, next_depth, link_map, graph_params, tokenObj)
    new_links = [];
    for i=1:length(new_users)
        nl = query_user_links(new_users{i}, next_depth, link_map, graph_params, tokenObj);
        new_links = [new_links; nl];
    end
end
function new_links = query_user_links(user_id, next_depth, link_map, graph_params, tokenObj)
    %% Get the newest link_limit posts (links)
    req = sprintf('/user/%s/submitted?limit=%d&sort=new', user_id, graph_params.link_limit);
    r = reddit_query(req, tokenObj);
    posts_list = r.data.children;

    %% Get links associated with the newest comment_limit comments
    req = sprintf('/user/%s/comments?limit=%d&sort=new', user_id, graph_params.comment_limit);
    r = reddit_query(req, tokenObj);
    % Get link ids the user has commented on
    comment_list = r.data.children;
    comment_link_ids = arrayfun(@(x)(x.data.link_id), comment_list, 'UniformOutput',false);
    if ( ~isempty(comment_link_ids) )
        comment_link_ids = unique(comment_link_ids, 'stable');
        % Resolve the commented-on links in batches via /by_id
        batch_size = 10;
        for i=1:batch_size:length(comment_link_ids)
            batch_idx = i:min((i+batch_size-1),length(comment_link_ids));
            req = sprintf('/by_id/%s', strjoin(comment_link_ids(batch_idx), ','));
            r = reddit_query(req, tokenObj);
            comment_links_list = r.data.children;
            posts_list = [posts_list; comment_links_list];
        end
    end
    new_links = create_new_links(posts_list, next_depth, link_map, graph_params);
end
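% Illustration (hypothetical fullnames): a batch of three commented-on
% links is resolved with a single request such as
%   req = '/by_id/t3_abc123,t3_def456,t3_ghi789'
% so at most one API call is spent per batch_size links.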
function new_links = create_new_links(link_listing, depth, link_map, graph_params)
    new_links = [];
    link_ids = arrayfun(@(x)(x.data.name), link_listing, 'UniformOutput',false);
    % Drop duplicate link ids
    [link_ids, ia] = unique(link_ids, 'stable');
    unique_links = link_listing(ia);
    % Drop all previously explored links
    bExplored = isKey(link_map, link_ids);
    link_ids = link_ids(~bExplored);
    unique_links = unique_links(~bExplored);
    % Drop links posted by ignored users
    link_users = arrayfun(@(x)(x.data.author), unique_links, 'UniformOutput',false);
    bIgnored = ismember(link_users, graph_params.ignore_users);
    link_ids = link_ids(~bIgnored);
    link_users = link_users(~bIgnored);
    unique_links = unique_links(~bIgnored);
    link_titles = arrayfun(@(x)(x.data.title), unique_links, 'UniformOutput',false);
    link_subreddit = arrayfun(@(x)(x.data.subreddit), unique_links, 'UniformOutput',false);
    if ( isempty(link_ids) )
        return;
    end

    % Create new links
    new_links = struct('id',link_ids, 'subreddit',link_subreddit,...
        'user',link_users, 'depth',repmat({depth},length(link_ids),1));
    % Update the link map (containers.Map is a handle, so the caller sees this)
    for i=1:length(link_ids)
        link_map(link_ids{i}) = link_titles{i};
    end
end
function [new_users, G] = update_graph(link_users, G)
    nodeIDs = zeros(length(link_users),1);
    if ( G.numnodes > 0 )
        nodeIDs = findnode(G, link_users);
    end
    % Add new users to the graph (findnode returns 0 for unknown names)
    new_users = link_users(nodeIDs == 0);
    if ( ~isempty(new_users) )
        G = addnode(G, new_users);
        nodeIDs = findnode(G, link_users);
    end

    % Get a list of all user-user edges
    [SS,TT] = meshgrid(nodeIDs,nodeIDs);
    s = SS(:);
    t = TT(:);
    % Keep each unordered pair exactly once (and drop self-loops)
    bValid = s < t;
    s = s(bValid);
    t = t(bValid);

    edgeIDs = findedge(G, s,t);
    bValidEdges = (edgeIDs > 0);
    % Add weight to existing edges
    edgeIDs = edgeIDs(bValidEdges);
    if ( ~isempty(edgeIDs) )
        G.Edges.Weight(edgeIDs) = G.Edges.Weight(edgeIDs) + 1;
    end
    % Add new edges with weight 1
    newS = s(~bValidEdges);
    newT = t(~bValidEdges);
    if ( ~isempty(newS) )
        G = addedge(G, newS,newT, 1);
    end
end
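% Illustration: for nodeIDs = [2;5;7], the meshgrid + (s < t) filter above
% keeps exactly the unordered pairs (2,5), (2,7), (5,7), so every pair of
% users appearing on the same link gains one unit of edge weight.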
% Get request to a reddit api endpoint
% See: https://www.reddit.com/dev/api/
function resp = reddit_query(endpoint, tokenObj)
    if ( endpoint(1) == '/' )
        endpoint = endpoint(2:end);
    end

    %% Re-auth if the token has expired
    expire_reset = seconds(tokenObj.ExpireTime - datetime('now'));
    if ( expire_reset <= 0 )
        getAuthToken(tokenObj);
    end

    %% Rate-limit requests
    if ( tokenObj.RequestsLeft <= 0 )
        wait_reset = seconds(tokenObj.RequestResetTime - datetime('now'));
        % Always wait at least 0.5 seconds, to pad requests a little
        if ( wait_reset > 0 )
            wait_reset = max(wait_reset, 0.5);
            pause(wait_reset);
        end
        tokenObj.RequestsLeft = 1;
    end

    %%
    url = sprintf('https://oauth.reddit.com/%s', endpoint);
    headers = {'Authorization', sprintf('%s %s', tokenObj.TokenType, tokenObj.AccessToken)};
    opts = weboptions('UserAgent',tokenObj.UserAgent,...
        'Timeout',60,...
        'HeaderFields',headers);
    resp = webread(url, opts);
    % Manually reset the rate-limit fields: MATLAB's webread doesn't expose
    % response headers, so reddit's rate-limit values can't be read back
    tokenObj.RequestsLeft = tokenObj.RequestsLeft - 1;
    tokenObj.RequestResetTime = datetime('now') + seconds(1.0);
end
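% Usage sketch (assumes tokenObj from reddit_auth above): fetch the five
% hottest /r/math links and pull out their titles.
r = reddit_query('r/math/hot?limit=5', tokenObj);
titles = arrayfun(@(x)(x.data.title), r.data.children, 'UniformOutput',false);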
%% Get a 1-hr token
credStruct = load_reddit_creds('reddit_creds.json');
tokenObj = reddit_auth(credStruct);

%% Generate a user-graph from Reddit
G = reddit_graph(tokenObj);

%% Create a weighted adjacency matrix to feed the clustering
nn = numnodes(G);
[s,t] = findedge(G);
A = sparse(s,t,G.Edges.Weight,nn,nn);
% Symmetrize: findedge lists each undirected edge once (with s < t)
A = A + A.';

%% Clustering using an arbitrary k (Laplacian Eigenmaps)
idx = Cluster.kmeans_le(A, 10);
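%% Inspect the clusters (illustrative follow-up, not part of this commit):
% idx is in node order, so group the graph's user names by cluster label
for c = 1:max(idx)
    members = G.Nodes.Name(idx == c);
    fprintf('Cluster %d (%d users): %s\n', c, numel(members), strjoin(members', ', '));
end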