Commit 0b81ac9c authored by Andrew Cohen's avatar Andrew Cohen

updated with +Count : reddit/api/throttling

parent 945d30e7
% TODO not supported yet
function f = GetCountGoogle(term)
global nQueryCount nCacheCount
......
% pubmed query
function count=pubmed(term)
count=0;
persistent creds timerVal mapCache
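% persistent state survives across calls in the same MATLAB session:
% the credentials, the last-query timer used for throttling, and a
% per-term result cache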
if isempty(creds)
[creds.email,creds.appID]=getPubmedCredentials();
end
% throttle to at most 3 calls per second (one call per 1/3 s), per NCBI E-utilities guidelines
tThrottle=1/3;
if ~isempty(timerVal)
elapsedTime=toc(timerVal);
deltaT=tThrottle-elapsedTime;
if deltaT>0
pause(deltaT);
end
end
if isempty(mapCache)
mapCache=containers.Map();
end
if mapCache.isKey(term)
count=mapCache(term);
return;
end
dd=query(term,creds);
timerVal=tic();
count=0;
if isempty(dd)
return
end
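% egquery returns one <Count> element per Entrez database; summing them
% gives a single global hit count for the term across all databases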
dparse=regexp(dd,'<Count>(?<count>\d+)','names');
count=sum(cellfun(@str2double,{dparse.count}));
mapCache(term)=count;
function dd=query(term,creds)
queryStr=['http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi?term=' term ...
'&email=' creds.email '&tool=' creds.appID];
tic;dd=webread(queryStr);toc
function [email,appID]=getPubmedCredentials()
email='you@yourServer.net';
appID='myInterestingAppName';
% placeholder defaults; your real email and appID are loaded from pubmed.mat below when present
mp=mfilename('fullpath');
folder=fileparts(mp);
credsFile=fullfile(folder,'pubmed.mat');
if ~exist(credsFile,'file')
fprintf(2,'no pubmed.mat credentials file found.\n');
return;
end
load(credsFile,'email','appID');
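% A minimal sketch of creating the pubmed.mat file loaded above (the values
% shown are placeholders; the variable names must be email and appID to
% match the load() call, and the file must sit next to this function):
%
%   email = 'you@yourServer.net';
%   appID = 'myInterestingAppName';
%   save('pubmed.mat','email','appID');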
% reddit query
function nUps=reddit(term)
nComments=0;
nUps=0;
persistent creds timerVal token mapCache
if isempty(creds)
[creds.client_id,creds.secret]=getRedditCredentials();
end
% throttle to 1 call per second to stay under reddit's API rate limit
tThrottle=1;
if ~isempty(timerVal)
elapsedTime=toc(timerVal);
deltaT=tThrottle-elapsedTime;
if deltaT>0
pause(deltaT);
end
end
if isempty(mapCache)
mapCache=containers.Map();
end
if mapCache.isKey(term)
nUps=mapCache(term);
return;
end
if isempty(token)
token=getToken(creds.client_id,creds.secret);
end
dd=[];
nrep=0;
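% reddit OAuth access tokens expire (typically after about an hour); if the
% query throws, refresh the token once and retry before giving up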
while nrep<2
try
dd=query(term,token);
break;
catch exception
nrep=nrep+1;
if nrep<2
token=getToken(creds.client_id,creds.secret);
else
fprintf(2,'%s\n',exception.message);
end
end
end
timerVal=tic();
if isempty(dd)
return
end
if isempty(term) || strcmp(term,'')
nComments=1e9*dd.data.children(1).data.num_comments;
nUps=1e9*dd.data.children(1).data.ups;
else
for i=1:length(dd.data.children)
nComments=nComments+dd.data.children(i).data.num_comments;
nUps=nUps+dd.data.children(i).data.ups;
end
end
mapCache(term)=nUps;
function token=getToken(client_id,secret)
tokenOptions = weboptions('Username',client_id,'Password',secret);
token=webwrite('https://www.reddit.com/api/v1/access_token','grant_type=client_credentials',tokenOptions);
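% webwrite POSTs grant_type=client_credentials with HTTP basic auth
% (client_id/secret supplied via weboptions) and returns a struct; its
% access_token field is what query() sends as the bearer token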
function dd=query(term,token)
if isempty(term) || strcmp(term,'')
dd=queryAll(token);
return
end
queryStr=['https://oauth.reddit.com/search?q=' term '&sort=top&limit=100'];
queryOptions=weboptions('HeaderFields',{'Authorization',['bearer ' token.access_token]});
tic;dd=webread(queryStr,queryOptions);toc
function dd=queryAll(token)
queryStr=['https://oauth.reddit.com/r/all/top/?t=all&limit=100'];
queryOptions=weboptions('HeaderFields',{'Authorization',['bearer ' token.access_token]});
dd=webread(queryStr,queryOptions);
function [client_id,secret]=getRedditCredentials()
client_id=[];
secret=[];
% obtain the following from reddit -- https://github.com/reddit-archive/reddit/wiki/OAuth2
% client_id='your_client_ID'
% secret='your_secret'
mp=mfilename('fullpath');
folder=fileparts(mp);
credsFile=fullfile(folder,'reddit.mat');
if ~exist(credsFile,'file')
fprintf(2,'no reddit.mat credentials file found.\n');
return;
end
load(credsFile,'client_id','secret');
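% A minimal sketch of creating the reddit.mat file loaded above (the values
% shown are placeholders from your reddit app registration, see the OAuth2
% wiki link; the variable names must be client_id and secret to match the
% load() call, and the file must sit next to this function):
%
%   client_id = 'your_client_ID';
%   secret    = 'your_secret';
%   save('reddit.mat','client_id','secret');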
% wikipedia query
function count=wikipedia(term)
persistent mapCache
if isempty(mapCache)
mapCache=containers.Map();
end
if mapCache.isKey(term)
count=mapCache(term);
return;
end
URL=['https://en.wikipedia.org/w/index.php?title=Special%3ASearch&profile=default&search=' term '+&fulltext=Search'];
str = urlread(URL);
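% scrape the hit count out of the "results-info" div on the Special:Search
% results page; this depends on Wikipedia's current HTML, so fall back to 0
% if the markup changes and the parse fails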
try
idx=strfind(str,'div class="results-info"');
idx=idx(1);
str2=str(idx:end);
idx2=strfind(str2,'</div>');
idx2=idx2(1);
str2=str2(1:idx2);
[s1 s2]=regexp(str2,'[0-9 ,]+');
count=str2double(str2(s1(end):s2(end)));
catch
count=0;
end
mapCache(term)=count;
......@@ -29,3 +29,5 @@ _ReSharper*/
[Tt]est[Rr]esult*
*.asv
~$*
*.mat
function f = GetCount(words)
function f = GetCount(words,strServer)
global nQueryCount nCacheCount strServer
if isempty(strServer)
if ~exist('strServer','var')
strServer='wikipedia'
end
......@@ -13,116 +11,48 @@ end
idx=strfind(term,' ');
term(idx)='+';
%check cache
global QueryCache
if ~isempty(QueryCache)
cc=strcmp(QueryCache.Queries,term);
if ~isempty(cc) && any(cc)
idx=find(cc);
f=QueryCache.Count(idx);
f=f-1;
nCacheCount=nCacheCount+1;
return
end
end
if strcmp(strServer,'pubmed')
f=doQueryPubmed(term);
f=Count.pubmed(term);
elseif strcmp(strServer,'amazon')
f=doQueryAmazon(term);
elseif strcmp(strServer,'google')
f=GetCountGoogle(term);
elseif strcmp(strServer,'reddit')
f=Count.reddit(term);
else
f=doQueryWikipedia(term);
f=Count.wikipedia(term);
end
% f=max(f,1); % no zero counts - sends nwd to NaN
f=f+1;
if ~isempty(QueryCache)
nQueryCount=nQueryCount+1;
QueryCache.Queries=[QueryCache.Queries {term}];
QueryCache.Count=[QueryCache.Count f];
end
end
function f=doQueryWikipedia(term)
URL=['https://en.wikipedia.org/w/index.php?title=Special%3ASearch&profile=default&search=' term '+&fulltext=Search'];
str = urlread(URL);
try
idx=strfind(str,'div class="results-info"');
idx=idx(1);
str2=str(idx:end);
idx2=strfind(str2,'</div>');
idx2=idx2(1);
str2=str2(1:idx2);
[s1 s2]=regexp(str2,'[0-9 ,]+');
f=str2double(str2(s1(end):s2(end)));
catch
f=0;
end
end
function f=doQueryPubmed(term)
URL = [ 'http://www.ncbi.nlm.nih.gov/gquery/?term=' term];
str = urlread(URL);
try
key = '<h2>About ';
idx1=strfind(str,key);
idx1=idx1+length(key);
idx2=strfind(str(idx1:end), 'search results for');
nx=str(idx1:idx1+idx2-2);
f=str2double(nx);
catch
f=0;
end
end
function f=doQueryAmazon(term)
URL=['http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=' term '&rh=i%3Aaps%2Ck%3A' term];
for i=1:5
str = urlread(URL);
try
%amazon
idx=strfind(str,'results for <span>');
idx=idx(1)-2;
idxStart=idx;
chPrev = str(idxStart-1);
while(isNumeric(chPrev))
idxStart=idxStart-1;
chPrev = str(idxStart-1);
end
nx=str(idxStart:idx);
f=str2double(nx);
break;
catch
f=0;
pause(1);
end
end
end
function bNumber = isNumeric(ch)
bNumber=1;
if ~isnan(str2double(ch))
return
end
if ch==','
return
end
bNumber=0;
end
% amazon no longer returning search totals as of 1/2019...
% function f=doQueryAmazon(term)
%
% URL=['http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=' term '&rh=i%3Aaps%2Ck%3A' term];
% for i=1:5
%
% str = urlread(URL);
% try
% %amazon
% idx=strfind(str,'results for <span>');
% idx=idx(1)-2;
% idxStart=idx;
% chPrev = str(idxStart-1);
% while(isNumeric(chPrev))
% idxStart=idxStart-1;
% chPrev = str(idxStart-1);
% end
% nx=str(idxStart:idx);
% f=str2double(nx);
% break;
% catch
% f=0;
% pause(1);
% end
% end
%
% end
function [d]=GetDistances(words,idxKey)
function [d,Confusion,nCorrect]=GetDistances(words,idxKey,strServer)
global QueryCache
if isempty(QueryCache)
QueryCache.Queries={};
QueryCache.Count=[];
if ~exist('strServer','var')
strServer='wikipedia';
end
global nQueryCount nCacheCount
nQueryCount=0;
nCacheCount=0;
d=[];
rgNWDx=[];rgNWD=[];
......@@ -28,10 +21,10 @@ for i=1:length(words)
wordList=words(idx);
wordListX=[words(i) words(idx)];
end
rgNWDx(j,i)=NWD(wordListX);
rgNWD(j,i)=NWD(wordList);
rgNWDx(j,i)=NWD(wordListX,strServer);
rgNWD(j,i)=NWD(wordList,strServer);
end
d = rgNWDx-rgNWD;
idx=find(isnan(d) | isinf(d));
d(idx)=inf;
......@@ -46,19 +39,21 @@ for i=1:length(words)
[mm nPred] = min(d(:,i));
Confusion(nTrue,nPred)=Confusion(nTrue,nPred)+1;
end
fid=fopen('results.txt','a');
fprintf(fid,'\n\n-------------------------\n')
fprintf(fid,'GetDistances::(multiples) NW=5 \n');
c=clock;
c=num2str(c)
fprintf(fid,'%s\n',c);
for i=1:length(words)
fprintf(fid,'%s,',words{i});
nCorrect=sum(diag(Confusion));
bWriteFile=false;
if bWriteFile
fid=fopen('results.txt','a');
fprintf(fid,'\n\n-------------------------\n')
fprintf(fid,'GetDistances::(multiples) NW=5 \n');
c=clock;
c=num2str(c)
fprintf(fid,'%s\n',c);
for i=1:length(words)
fprintf(fid,'%s,',words{i});
end
% fprintf(fid,'nQueryCount=%d, nCacheCount=%d\n',nQueryCount, nCacheCount);
% fprintf(fid,'\n');
c=mat2str(Confusion);
fprintf(fid,'%s\n',c);
fclose(fid);
end
fprintf(fid,'nQueryCount=%d, nCacheCount=%d\n',nQueryCount, nCacheCount);
fprintf(fid,'\n');
c=mat2str(Confusion);
fprintf(fid,'%s\n',c);
fclose(fid);
% strServer in 'wikipedia','amazon','pubmed','google','reddit'
function [nwd rgCache]=NWD(words, strServer)
function [nwd rgCache]=NWD(words, varargin)
if nargin<2
bNWDmin=0;
else
bNWDmin=varargin{1};
if ~exist('strServer','var')
strServer='wikipedia';
end
fX=GetCount(words);
fX=GetCount(words,strServer);
fw=[];
fexclude=[];
for i=1:length(words)
fw(i)=GetCount(words(i));
% ww=words;
% ww(i)=[];
% fexclude(i)=GetCount(ww);
fw(i)=GetCount(words(i),strServer);
end
% N=GetCount({'the'});
% N=11e6 ;
N=4776196; % wikipedia!
% N=25672211*1e3; % amazon
% if bNWDmin
% nwd = (log2(min(fw))-log2(fX)) / (log2(N)-log2(min(fw))); % from xx.pdf
% else
% nwd = (log2(max(fw))-log2(fX)) / (log2(N)-log2(min(fw))); % III.3 from arxiv
% end
% nwd = (log2(max(fw))-log2(fX)) / (log2(N)-log2(min(fexclude))); % III.3 from arxiv
N=getN('reddit');
nwd = (log2(max(fw))-log2(fX)) / (log2(N)-log2(min(fw)));
%
% % normalize for cardinality
nwd = nwd/(length(words)-1);
% ww = length(words);
% renorm = log2(N/ww)/log2(ww);
% % renorm = log2( (1-1/ww) *N ) / log2(N/ww);
% nwd = nwd / renorm;
% % normalize for cardinality ?
% nwd = nwd/(length(words)-1);
4;
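% For reference, the distance computed above is the Normalized Web Distance
% generalized to a word list: fX is the hit count for all words queried
% together, fw(i) the hit count for each word alone, and N the estimated
% total number of indexed items (see getN below):
%
%   NWD = ( log2(max(fw)) - log2(fX) ) / ( log2(N) - log2(min(fw)) )
%
% then divided by (length(words)-1) to normalize for list cardinality.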
function N=getN(strServer)
N=0;
% calibration word pairs that should achieve NWD=1 (unrelated terms at the maximum distance)
maxN={ {'einstein','fudge'},{'apple','xyzzy'},{'imbecile','newton'}};
cn=[];
for i=1:length(maxN)
words=maxN{i};
fX=GetCount(words,strServer);
fw=[];
fexclude=[];
for j=1:length(words)
fw(j)=GetCount(words(j),strServer);
end
% nwd = 1 = (log2(max(fw))-log2(fX)) / (log2(N)-log2(min(fw)));
cn(i)=2^((log2(max(fw))-log2(fX))+log2(min(fw)));
end
N=median(cn);
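% Calibration logic: for a pair of unrelated terms that should sit at the
% maximum distance NWD = 1,
%
%   1 = ( log2(max(fw)) - log2(fX) ) / ( log2(N) - log2(min(fw)) )
%
% solving for N gives N = 2^( log2(max(fw)) - log2(fX) + log2(min(fw)) ),
% i.e. N = max(fw)*min(fw)/fX, which is what cn(i) computes per pair; the
% median over the calibration pairs is taken as the working estimate of N.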
% authors
% words= {
% {'Macbeth','The Tempest','Othello','King Lear','Hamlet'...
% 'The Merchant of Venice','A Midsummer Nights Dream',...
% 'Much Ado About Nothing', 'Taming of the Shrew','Twelfth Night' }...
% {'Carrie','Salems Lot','The Shining','The Stand','The Dead Zone',...
% 'Firestarter','Cujo'}...
% {'Adventures of Huckleberry Finn','A Connecticut Yankee in King Arthurs Court','Life on the Mississippi'...
% 'Puddnhead Wilson'}...
% {'The Old Man and The Sea','The Sun Also Rises','For Whom the Bell Tolls','A Farewell To Arms'}...
% {'Anna Karenina','War and Peace','The Death of Ivan Ilyich'}...
% };
%
% famous
% words={
% {'Isamu Akasaki', 'Hiroshi Amano', 'Shuji Nakamura','Eric Betzig', 'Stefan W. Hell', 'William E. Moerner','John O''Keefe', 'May-Britt Moser', 'Edvard I. Moser' }
% {'Albert Einstein','Isaac Newton','Stephen Hawking','Nils Bohr','James Maxwell'}
% {'Amedeo Avogadro','Louis Pasteur','Linus Pauling','Robert Boyle'}
% {'Elizabeth Blackwell','William Harvey','Carl Jung','Richard Lister'}
% } ;
% scientists
words={
{'Kolmogorov','Fermat','Hilbert', 'Godel', 'Riemann','Gauss'} % mathematicians
{'Einstein','Newton','Hawking','Bohr','Maxwell','Boltzmann',} %physicists
{'Freud','Pavlov','Skinner','Jung'} %psychologists
{'Turing','Lovelace','Knuth','Hopper'} % computer scientists
{'Darwin','Lamarck','Linnaeus','Mendel'} % biology
}
%colors animals
words={
{'red','orange','yellow','green','blue','indigo','violet'}
{'lion','tiger','bear','monkey','zebra','elephant','aardvark','lamb','fox','ape','dog'}
};
words={
{'Barack Obama','Hillary Clinton','John Edwards','Joe Biden','Chris Dodd','Mike Gravel'}
{'John McCain','Mitt Romney','Mike Huckabee','Ron Paul','Fred Thompson','Alan Keyes'}
};
idxKey=[];
for i=1:length(words)
idxKey=[idxKey,repmat(i,1,length(words{i}))];
end
strServer='reddit'
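% Evaluation below: GetDistances builds a confusion matrix from the
% multi-word NWD, GetDistancesPairs clusters pairwise NWDs and matches the
% labels to the best permutation, and getDistanceNearestNeighbor assigns
% each word the class of its nearest neighbor; each nCorrect reports how
% many words land in their true group.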
[d,Confusion,nCorrect]=GetDistances([words{:}],idxKey,strServer)
[idx,nCorrect]=GetDistancesPairs([words{:}],idxKey,strServer)
[idxNN,nCorrectNN]=getDistanceNearestNeighbor([words{:}],idxKey,strServer)
function [idxNN,nCorrect]=getDistanceNearestNeighbor(words,idxKey,strServer)
if ~exist('strServer','var')
strServer='wikipedia';
end
idxNN=[];
for i=1:length(words)
% reset the distances each pass so the self-distance slot stays Inf and
% values from the previous word do not leak into the min below
d=Inf(1,length(words));
for j=1:length(words)
if i==j
continue
end
d(j)=NWD([words(i),words(j)],strServer);
end
[dmin,idxmin]=min(d);
idxNN(i)=idxKey(idxmin);
end
nCorrect=length(find(idxNN==idxKey));
\ No newline at end of file
function idx=GetDistancesPairs(words,idxKey)
d=[];
function [idx,nCorrect]=GetDistancesPairs(words,idxKey,strServer)
global QueryCache
if isempty(QueryCache)
QueryCache.Queries={};
QueryCache.Count=[];
if ~exist('strServer','var')
strServer='wikipedia';
end
global nQueryCount nCacheCount
nQueryCount=0;
nCacheCount=0;
path(path,'..\Gap');
d=[];
for i=1:length(words)
for j=i+1:length(words)
d(i,j)=NWD([words(i),words(j)]);
d(i,j)=NWD([words(i),words(j)],strServer);
end
end
......@@ -41,21 +37,27 @@ for i=1:size(idxPerms,1)
end
[nCorrect iBest]=max(rgCorrect);
idxWrong = rgWrong(iBest);
idx=idxCluster;
fid=fopen('results.txt','a');
fprintf(fid,'\n\n-------------------------\n')
fprintf(fid,'GetDistances::(pairs) \n');
c=clock;
c=num2str(c)
fprintf(fid,'%s\n',c);
for i=1:length(words)
fprintf(fid,'%s,',words{i});
% set the output clustering to match the permutation with the min error
% rate -- just 'cause
idx=idxCluster';
idxMap=idxPerms(iBest,:);
for i=1:length(idxMap);idx(idxCluster==idxMap(i))=i;end
bWriteFile=false;
if bWriteFile
fid=fopen('results.txt','a');
fprintf(fid,'\n\n-------------------------\n')
fprintf(fid,'GetDistances::(pairs) \n');
c=clock;
c=num2str(c)
fprintf(fid,'%s\n',c);
for i=1:length(words)
fprintf(fid,'%s,',words{i});
end
fprintf(fid,'kGap=%d\n',kGap);
% fprintf(fid,'nQueryCount=%d, nCacheCount=%d\n',nQueryCount, nCacheCount);
% fprintf(fid,'nCorrect=%d\n',nCorrect);
fclose(fid);
end
fprintf(fid,'kGap=%d\n',kGap);