#!/usr/bin/ruby
#All the requirements
require "cgi"
require "net/http"
require "uri"
require "date"
# Extracts the first double-quoted value from an attribute fragment such as
# href="http://example.com/">text.
#
# href - String holding (part of) an HTML attribute.
#
# Returns the text between the first and second double quote with newlines and
# single quotes removed. If there is no closing quote, everything after the
# opening quote is used; if there is no double quote at all, returns "".
def cutlink(href)
  _lead, opening, remainder = href.partition("\"")
  return "" if opening.empty?

  # Without a second quote, partition hands back the whole remainder.
  quoted = remainder.partition("\"").first
  quoted.delete("\n'")
end
# Collects every href value found in a chunk of HTML.
#
# body - String of HTML. It is split on single spaces, and only pieces that
#        begin with "href" (any letter case) are treated as link attributes.
#
# Returns a String of lower-cased links, each one followed by "@"; callers
# split on that character to get an array of links. Returns "" when no piece
# starts with "href".
def cutlinks(body)
  collected = ""
  body.split(/ /).each do |piece|
    next unless piece[0..3].downcase == "href"

    #the @ character is used to split this into an array of links
    collected << cutlink(piece).downcase << "@"
  end
  collected
end
# Filters extracted links down to the ones that count as external results.
#
# links - Array of link Strings. nil entries are dropped; previously a nil
#         entry raised NoMethodError because the nil check ran only after
#         several String calls on the same element.
#
# Returns a new Array with the surviving links in their original order.
def sortlinks(links)
  # Substrings that disqualify a link wherever they occur in it.
  blocked_fragments = [
    "search?q=cache:",       # part of google's caching system
    "google",                # something from google (mail, docs, etc)
    "\n",                    # links shouldn't have newline characters
    "froogle",               # links shouldn't be store items
    "Froogle",
    "cm_mmc=seo-_-feeds",
    "youtube.com/results?q", # links shouldn't be youtube searches
    " "                      # links shouldn't have spaces
  ]
  # Characters 7..9 are the first three after "http://". These prefixes were
  # used when google's cache servers were matched by hand, before the
  # "search?q=cache:" check above was introduced.
  blocked_host_prefixes = ["209", "64.", "72."]

  links.reject do |link|
    link.nil? ||
      link[0..0] == "/" ||          # local link
      link[0..4] == "https" ||      # links shouldn't use secure socket layer
      blocked_host_prefixes.include?(link[7..9]) ||
      blocked_fragments.any? { |fragment| link.include?(fragment) }
  end
end
# Fetches the first Google results page (num=10, start=0) for a search term.
#
# term - String query, already prepared for use inside a URL query string
#        (the caller joins words with "+"). Leading and trailing whitespace
#        is stripped here, because a stray newline from a line read out of a
#        file makes URI.parse reject the URL.
#
# Returns the response body exactly as Net::HTTP delivers it. The HTTP status
# is not inspected. Errors from URI.parse and Net::HTTP are not rescued.
#
# TODO: only one page is requested; paging through all results and threading
# the requests were noted as possible future work.
def google(term)
  query = term.strip
  url = "http://www.google.com/search?q=#{query}&num=10&hl=en&lr=&as_qdr=all&start=0&sa=N"
  Net::HTTP.get_response(URI.parse(url)).body
end
#Program start
# For every search term listed (one per line) in ../uploads/<fname>, fetch the
# Google results page, extract and filter its links, and report where the
# first link containing `domain` sits among the first ten surviving links.
#
# Output on stdout: a "Content-type: text/html" header, then one line per term
#   <term with spaces replaced by "+">,<matching link or "not found">,<place>
# where <place> is the 1-based position of the match, or 0 when no link
# matched.
#cgi = CGI.new
#fname = cgi['filename']
fname = "altkeywords.txt"
domain = "wikipedia.org"

# File.readlines closes the file once it has been read. Each term loses its
# line ending and gets the "+" separators the search URL requires.
terms = File.readlines("../uploads/" + fname).map { |line| line.chomp.tr(" ", "+") }

# NOTE(review): output.csv is created/truncated here but nothing is written
# to it; results only go to stdout. Confirm whether the rows were meant to be
# saved to this file as well.
File.open("output.csv", "w").close

bodys = terms.map { |term| google(term) }

print "Content-type: text/html\n\n"

# cutlinks returns one "@"-separated string; turn it into a filtered,
# de-duplicated array of links for each term.
links = bodys.map { |body| sortlinks(cutlinks(body).split(/@/)).uniq }

final = terms.each_with_index.map do |term, j|
  # Only the first ten links are considered, and the earliest match wins.
  position = links[j].first(10).index { |link| link.include?(domain) }
  flink = position ? links[j][position] : "not found"
  # Computed per term so a miss reports 0 instead of an earlier term's place.
  place = position ? position + 1 : 0
  "#{term},#{flink},#{place}"
end

final.each { |row| print row + "\n" }