Quantcast WoWInterface - View Single Post - Database crawler
View Single Post
03-10-14, 01:42 AM   #1
xxauroraxx
A Murloc Raider
Join Date: Mar 2014
Posts: 5
Database crawler

Thought this would be helpful in case anyone wants to create a local repository of items.

E: Parser would be better terminology...

This script requests the item page on either Blizzard's or Wowhead's site for every item ID from start up to (but not including) finish, extracts the information shown for each item that exists, and stores it in a pickled dictionary.

Python 2.7
Code:
import asyncore
import string, socket
import StringIO
import mimetools, urlparse
import pickle
import re

# Item IDs to crawl: the first ID requested, and the ID at which the crawl
# stops (the stop value itself is not requested).
start = 1000
finish = 100000

# Base URLs of the two supported sites; the item ID is appended directly to
# the end of the chosen one to form each page address.
blizzard = "http://us.battle.net/wow/en/item/"
wowhead = "http://www.wowhead.com/item="

# Site that is actually crawled: set to either `blizzard` or `wowhead`.
url = blizzard

class AsyncHTTP(asyncore.dispatcher_with_send):
    """Minimal asynchronous HTTP/1.0 GET requestor built on asyncore.

    The response is handed to a *consumer* object:

    * ``consumer.http_header(request)`` (optional) is called once when the
      response header has been parsed, and also when the connection fails
      (in which case ``request.status`` is still ``None``).
    * ``consumer.feed(data)`` is called with each chunk of the payload.
    """

    def __init__(self, uri, consumer):
        """Build the GET request for *uri* and start connecting.

        Only ``http`` URIs are accepted; anything else trips the assertion
        below. The port defaults to 80 when the URI gives none or gives one
        that is not an integer.
        """
        asyncore.dispatcher_with_send.__init__(self)

        self.uri = uri
        self.consumer = consumer

        # turn the uri into a valid request
        scheme, host, path, params, query, fragment = urlparse.urlparse(uri)
        assert scheme == "http", "only supports HTTP requests"
        host, _, port_text = host.partition(":")
        try:
            port = int(port_text)
        except ValueError:
            port = 80  # default port (missing or non-numeric port)
        if not path:
            path = "/"
        if params:
            path = path + ";" + params
        if query:
            path = path + "?" + query

        self.request = "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (path, host)

        self.host = host
        self.port = port

        # Both stay None until the response header has been parsed.
        self.status = None
        self.header = None

        # Buffer for header bytes received so far.
        self.data = ""

        # get things going!
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        self.connect((host, port))

    def _notify_header(self):
        """Call the consumer's ``http_header`` hook if it defines one."""
        try:
            http_header = self.consumer.http_header
        except AttributeError:
            pass
        else:
            http_header(self)

    def handle_connect(self):
        """Send the prepared request once the connection is established."""
        self.send(self.request)

    def handle_expt(self):
        """Close the channel and notify the consumer (status is None)."""
        self.close()
        self._notify_header()

    def handle_read(self):
        """Parse the response header, then stream the payload to the consumer."""
        data = self.recv(2048)
        # Compare against None rather than testing truthiness: a parsed
        # header object with zero fields is falsy, which would make every
        # later chunk be treated as header data again.
        if self.header is None:
            self.data = self.data + data
            end = self.data.find("\r\n\r\n")
            if end < 0:
                return  # header not complete yet; wait for more data

            # parse header
            fp = StringIO.StringIO(self.data[:end + 4])
            # status line is "HTTP/version status message"
            status = fp.readline()
            self.status = status.split(" ", 2)
            # followed by a rfc822-style message header
            self.header = mimetools.Message(fp)
            # followed by a newline, and the payload (if any)
            data = self.data[end + 4:]
            self.data = ""
            # notify consumer (status is now set)
            self._notify_header()
            if not self.connected:
                return  # channel was closed by consumer

        self.consumer.feed(data)

    def handle_close(self):
        """Close the channel; the consumer is not told about end of data."""
        #self.consumer.close()
        self.close()

class DummyConsumer:
    """Consumer that simply accumulates the response payload in memory.

    ``size`` is the number of payload bytes received so far and ``text`` is
    the payload itself.
    """

    # Class-level defaults; feed() replaces them with per-instance values.
    size = 0
    text = ''

    def http_header(self, request):
        """Report a failed connection; a parsed header needs no action."""
        if request.status is not None:
            return
        print("connection failed")

    def feed(self, data):
        """Append one chunk of payload and update the running byte count."""
        self.size += len(data)
        self.text += data

    # No close() hook is defined, so nothing is reported at end of data.

#
# try it out

# Captures the text between one tag's closing '>' and the next tag's '<'.
_TEXT_BETWEEN_TAGS = re.compile('>(.*?)<', re.DOTALL | re.IGNORECASE)

# Start of the element that holds the item name.
_NAME_MARKER = '<b class="q'


def parse_item_page(html):
    """Extract the item name and its attribute strings from an item page.

    Returns ``None`` when the page contains no item-name element, otherwise
    a ``(name, attributes)`` tuple where *attributes* is the list of text
    fragments found between tags from the name element up to the first
    ``ge('icon`` occurrence.
    """
    marker_at = html.find(_NAME_MARKER)
    if marker_at < 0:
        return None

    # Keep the page from the name element onwards. Slicing drops only the
    # leading part; replacing the prefix text would also delete any later
    # occurrence of it when the prefix is short.
    body = html[marker_at:]

    # Text between the first marker and the next one (if any). The first
    # three characters finish the opening tag -- presumably a one-character
    # quality class followed by '">' -- so skip exactly those.
    segment = html.split(_NAME_MARKER, 2)[1]
    name = segment[3:].split('</b>')[0]

    # Remove the name itself so it is not listed among the attributes.
    body = body.replace(name, '')
    body = body.split("ge('icon")[0].rstrip(' \t\n\r')

    # Keep fragments longer than one character that do not span lines.
    attributes = [
        text for text in _TEXT_BETWEEN_TAGS.findall(body)
        if len(text) > 1 and '\n' not in text
    ]
    return name, attributes


def main():
    """Fetch every item page in [start, finish) and pickle the results.

    Progress is printed and appended to ``log.txt``; the collected
    ``{name: attributes}`` dictionary is written to ``itemdatabase.db``.
    """
    # Created once, before the loop, so every item found is kept.
    item_db = {}

    for item_id in range(start, finish):
        page_url = "%s%s" % (url, item_id)

        consumer = DummyConsumer()
        AsyncHTTP(page_url, consumer)
        asyncore.loop()  # returns once the request's channel is closed
        print(page_url)

        parsed = parse_item_page(consumer.text)

        # Open and close the log for each page so that it is flushed to
        # disk as the crawl progresses and no file handles are leaked.
        with open('log.txt', 'a') as log:
            if parsed is not None:
                print('FOUND AN ITEM')
                name, attributes = parsed
                item_db[name] = attributes
                heading = 'Adding %s : item %s with attributes:' % (name, item_id)
                print(heading)
                log.write(heading)
                for attribute in attributes:
                    print(' ' + attribute)
                    log.write(' ' + attribute)
                print('\n')
                log.write('\n')

            log.write(page_url + '\n')

    with open('itemdatabase.db', 'wb') as db_file:
        pickle.dump(item_db, db_file)
    print("Complete and written to 'itemdatabase.db'!")


if __name__ == "__main__":
    main()

Last edited by xxauroraxx : 03-10-14 at 01:47 AM.
  Reply With Quote