#!/usr/bin/ruby #All the requirements require "cgi" require "net/http" require "uri" require "date" def cutlink(href) h_array = href.split(//) link = ""; inlink = false; 0.upto(h_array.length-1) { |i| if(inlink) then if(h_array[i] == "\"") then inlink = false; break; end if(h_array[i] == "\n") then next end if(h_array[i] == "'") then next end link += h_array[i] next end if(h_array[i] == "\"") then inlink = true end } return link end def cutlinks(body) linkarray = "" bodyarray = body.split(/ /) 0.upto(bodyarray.length-1) { |i| if(bodyarray[i].to_s[0..3].downcase == "href") then #the @ character is used to split this into an array of links linkarray += cutlink(bodyarray[i]).downcase+"@" end } return linkarray end def sortlinks(links) newlinks = [] linkcount = 0 0.upto(links.length-1) { |i| if(links[i].include? "search?q=cache:") then next end #if it's part of google caching system if(links[i][0..0] == "/") then next end #if it's a local link if(links[i].include? "google") then next end #if it's something from google (mail, docs, etc) if(links[i].include? "\n") then next end #links shouldn't have newline characters if(links[i].include? "froogle") then next end #links shouldn't be store items if(links[i].include? "Froogle") then next end #links shouldn't be store items if(links[i].include? "cm_mmc=seo-_-feeds") then next end if(links[i].include? "youtube.com/results?q") then next end #links shouldn't be youtube searches if(links[i] == nil) then next end #if the link is null if(links[i].include? " ") then next end #links shouldn't have spaces if(links[i][7..9] == "209") then next end #these were used when I manually set google's cache servers if(links[i][7..9] == "64.") then next end #changed in favor of the "search?q=cache:" if(links[i][7..9] == "72.") then next end if(links[i][0..4] == "https") then next end #Links shouldn't use secure socket layer newlinks[linkcount] = links[i] #if it passes the above, it's added to the link array linkcount += 1 } return newlinks end def google(term) url="http://www.google.com/search?q="+term+"&num=10&hl=en&lr=&as_qdr=all&start=0&sa=N" body = "" #Iterates through the url for the specified searches #may unroll this to default to all (1000) results #may also thread this to speed it up a bit request = Net::HTTP.get_response(URI.parse(url)) body = request.body return body end #Program start #host = "localhost" #user = "minehowe_default" #pass = "babyoil" #db = "minehowe_keywords" #cgi = CGI.new #fname = cgi['filename'] fname = "altkeywords.txt" domain = "wikipedia.org" source = File.open("../uploads/"+fname, "r") sink = File.open("output.csv","w") terms = [] i = 0; term = source.gets while term != nil terms[i] = term i += 1 term = source.gets end 0.upto(terms.length-1){ |j| terms[j].chomp search_array = terms[j].split(//) terms[j] = "" 0.upto(search_array.length-1) { |i| if(search_array[i] == " ") then terms[j] += "+" #this adds the required "+" char in between the terms next end terms[j] += search_array[i] } } bodys = [] 0.upto(terms.length-1) { |j| bodys[j] = google(terms[j]) } links = [] print "Content-type: text/html\n\n" 0.upto(terms.length-1) { |j| links[j] = cutlinks(bodys[j]) links[j] = links[j].split(/@/) links[j] = sortlinks(links[j]) links[j].uniq! } final = [] place = 0 0.upto(terms.length-1) { |j| linkarray = links[j] flink = "not found" 9.downto(0) {|k| if(linkarray[k] != nil) then if(linkarray[k].include? domain) then flink = linkarray[k] place = k+1 end end } final[j] = terms[j].chomp+","+flink+","+place.to_s # print flink+"
\n" # formatted = terms[j]+","+flink+","+place.to_s # print formatted+"
" } 0.upto(terms.length-1) { |j| print final[j]+"\n" }