# Calavrezo Dan 352C2
"""Small hi5 profile crawler (Python 2).

Starting from one profile page, the script extracts the profile owner's id,
the "top friends" links and the profile-comment author links, stores every
user in the ``tema2`` table and every relation in the ``relatii`` table, and
recurses into users it has not stored before, down to a fixed depth.

The crawler functions use the module-level name ``cursor`` (a DB-API cursor),
which is created in the ``__main__`` section below.
"""

import urllib
import sgmllib
import MySQLdb


# Function that extracts data from what was parsed.
def calc_id(value):
    """Split a profile link into ``[user_id, name, profile_url]``.

    ``value`` is an href taken from the page; the code assumes the shape
    ``/friend/<digits>--<name>--...`` (the id is read starting at offset 8,
    i.e. right after an 8-character prefix such as ``/friend/``).

    Returns a 3-element list on success, or an empty list when the text in
    the id position is not made only of digits.
    """
    first_sep = value.find("--")
    user_id = value[8:first_sep]  # the friend's id
    if not user_id.isdigit():
        return []
    name_start = first_sep + 2
    name = value[name_start:value.find("--", name_start)]  # the friend's name
    # Address of the friend's profile.
    return [user_id, name, "http://www.hi5.com" + value]


class SimpleHTMLParser(sgmllib.SGMLParser):
    """SGML parser that collects profile links from ``<a>`` tags.

    After ``parse()`` the results are available through ``get_id()``,
    ``get_topfriends()`` and ``get_profilecomments()``.
    """

    # Values stored in ``inside_a_element`` while the attributes of one
    # ``<a>`` tag are scanned; the state is reset to idle in ``end_a``.
    _IDLE = 0          # nothing recognised yet
    _TOP_FRIEND = 1    # tag carries the "Profile_Friends_Firstname" marker
    _OWNER = 2         # tag is the "profile-nav-profile" link
    _HREF_SEEN = 3     # an href was remembered, waiting for a comment marker

    def __init__(self, verbose=0):
        sgmllib.SGMLParser.__init__(self, verbose)
        self.topfriends = []
        self.profilecomments = []
        self.inside_a_element = self._IDLE
        self.id = []

    def parse(self, s):
        """Feed the whole document ``s`` to the parser and finish parsing."""
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        """Handle an opening ``<a>`` tag.

        This is a small state machine that extracts only the needed
        information. Attributes are examined in the order they are given,
        so the marker attribute has to precede ``href`` for owner/top-friend
        links, and ``href`` has to precede the marker for comment links.
        """
        pending_href = ""
        for name, value in attributes:
            state = self.inside_a_element
            if (name == "id" and value == "profile-nav-profile"
                    and state == self._IDLE):
                self.inside_a_element = self._OWNER
            elif (name == "name"
                    and value.startswith("&lid=Profile_Friends_Firstname")
                    and state == self._IDLE):
                self.inside_a_element = self._TOP_FRIEND
            elif name == "href" and state == self._TOP_FRIEND:
                if value.startswith("/friend"):
                    self.topfriends.append(value)
            elif name == "href" and state == self._OWNER:
                if value.startswith("/friend"):
                    self.id.append(value)
            elif name == "href" and state == self._IDLE:
                # Remember the link; it only counts if a comment marker
                # follows within the same tag.
                pending_href = value
                self.inside_a_element = self._HREF_SEEN
            elif (name == "name"
                    and value.startswith("&lid=Profile_CommentsImage")
                    and state == self._HREF_SEEN):
                if pending_href:
                    self.profilecomments.append(pending_href)

    def end_a(self):
        """Handle a closing ``</a>`` tag: go back to the idle state."""
        self.inside_a_element = self._IDLE

    def get_topfriends(self):
        """Return the list of top-friend links, in page order."""
        return self.topfriends

    def get_profilecomments(self):
        """Return the distinct comment-author links as a set.

        The stored list is left untouched so the parser can keep appending
        to it (replacing it with a set would break ``start_a``).
        """
        return set(self.profilecomments)

    def get_id(self):
        """Return the list of links identifying the profile owner."""
        return self.id


def _store_user_if_new(user):
    """Insert ``user`` (``[id, name, url]``) into ``tema2`` unless present.

    Returns True when a row was inserted, False when the id already existed.
    """
    # Parameters are passed as a tuple, as the DB-API expects a sequence.
    cursor.execute("SELECT userid FROM tema2 WHERE userid=%s", (user[0],))
    if cursor.fetchall():
        return False
    cursor.execute(
        "INSERT INTO tema2 (userid, nume,adresa) VALUES (%s, %s, %s)",
        (user[0], user[1], user[2]))
    return True


def _record_links(links, owner_id, relation, adancime):
    """Store every user behind ``links`` and its relation to ``owner_id``.

    Users that were not in the database yet are crawled one level deeper.
    NOTE(review): the original file lost its indentation; crawling only
    newly inserted users is the reading assumed here - confirm.
    """
    for link in links:
        friend = calc_id(link)
        if len(friend) != 3:
            continue
        if _store_user_if_new(friend):
            crawler(friend[2], adancime - 1)
        if owner_id is not None:
            cursor.execute(
                "INSERT INTO relatii (userid1, userid2,tip) "
                "VALUES (%s, %s, %s)",
                (friend[0], owner_id, relation))


# The crawler function.
def crawler(site, adancime):
    """Crawl the profile at ``site`` and recurse ``adancime`` levels deep.

    With ``adancime == 0`` the depth and URL are printed and nothing is
    fetched. Otherwise the page is downloaded and parsed, the owner and the
    linked users are stored, and relations of type ``"topfriend"`` and
    ``"cometariu profil"`` are recorded. Uses the module-level ``cursor``.
    """
    if adancime == 0:
        print("%s %s" % (adancime, site))
        return

    fisierHtml = urllib.urlopen(site)
    try:
        s = fisierHtml.read()
    finally:
        fisierHtml.close()

    parser = SimpleHTMLParser()
    parser.parse(s)

    # First find the id of the current user.
    a1 = parser.get_id()
    if not a1:  # the profile id is no longer valid
        print("Profil sters")
        return

    b1 = calc_id(a1[0])
    owner_id = None
    if len(b1) == 3:
        # Insert the owner into the database if not there already.
        _store_user_if_new(b1)
        owner_id = b1[0]
    # When the owner id cannot be parsed the friends are still stored and
    # crawled, but no relation rows are written (there is no id to link to).

    # Insert the friends too, together with the relation type: either a
    # top-friends entry or a profile comment.
    _record_links(parser.get_topfriends(), owner_id, "topfriend", adancime)
    _record_links(parser.get_profilecomments(), owner_id,
                  "cometariu profil", adancime)


# Main section: connect to the online database, then set the crawling depth
# and the start profile.
if __name__ == "__main__":
    # NOTE(review): credentials are hard-coded in the source; consider
    # moving them to configuration.
    db = MySQLdb.connect(host="mysql.zendurl.com", user="ie2", passwd="ad",
                         db="ie2_zendurl1")
    cursor = db.cursor()
    try:
        crawler("http://www-www.hi5.com", 3)
        # Results:
        ## cursor.execute("SELECT nume FROM tema2")
        ## result = cursor.fetchall()
        ## for record in result:
        ##     print record
    finally:
        # Persist whatever was collected, even if the crawl stopped midway.
        db.commit()
        db.close()