Thought this would be helpful in case anyone wants to create a local repository of items
E: Parser would be better terminology...
This script requests the item page for every ID from `start` up to (but not including) `finish` on either Blizzard's or Wowhead's site (selected with `url`), retrieves the information about each item it finds, and stores that information in a pickled dictionary.
Python 2.7
Code:
import asyncore
import string, socket
import StringIO
import mimetools, urlparse
import pickle
import re
# Base URLs of the two supported sites; the item ID is appended to the end.
blizzard = "http://us.battle.net/wow/en/item/"
wowhead = "http://www.wowhead.com/item="

# Site that is actually queried.
url = blizzard

# Item IDs to request: the loop begins at `start` and stops before `finish`.
start = 1000
finish = 100000
class AsyncHTTP(asyncore.dispatcher_with_send):
# HTTP requestor
def __init__(self, uri, consumer):
asyncore.dispatcher_with_send.__init__(self)
self.uri = uri
self.consumer = consumer
# turn the uri into a valid request
scheme, host, path, params, query, fragment = urlparse.urlparse(uri)
assert scheme == "http", "only supports HTTP requests"
try:
host, port = string.split(host, ":", 1)
port = int(port)
except (TypeError, ValueError):
port = 80 # default port
if not path:
path = "/"
if params:
path = path + ";" + params
if query:
path = path + "?" + query
self.request = "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (path, host)
self.host = host
self.port = port
self.status = None
self.header = None
self.data = ""
# get things going!
self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
self.connect((host, port))
def handle_connect(self):
# connection succeeded
self.send(self.request)
def handle_expt(self):
# connection failed; notify consumer (status is None)
self.close()
try:
http_header = self.consumer.http_header
except AttributeError:
pass
else:
http_header(self)
def handle_read(self):
data = self.recv(2048)
if not self.header:
self.data = self.data + data
try:
i = string.index(self.data, "\r\n\r\n")
except ValueError:
return # continue
else:
# parse header
fp = StringIO.StringIO(self.data[:i+4])
# status line is "HTTP/version status message"
status = fp.readline()
self.status = string.split(status, " ", 2)
# followed by a rfc822-style message header
self.header = mimetools.Message(fp)
# followed by a newline, and the payload (if any)
data = self.data[i+4:]
self.data = ""
# notify consumer (status is non-zero)
try:
http_header = self.consumer.http_header
except AttributeError:
pass
else:
http_header(self)
if not self.connected:
return # channel was closed by consumer
self.consumer.feed(data)
def handle_close(self):
#self.consumer.close()
self.close()
class DummyConsumer:
    """Consumer that collects the whole response body in memory."""

    # Number of body characters received so far.
    size = 0
    # Body text received so far.
    text = ''

    def http_header(self, request):
        """Report a failed connection; do nothing once a status is present."""
        if request.status is not None:
            return
        print("connection failed")

    def feed(self, data):
        """Append one piece of incoming body data and update the running size."""
        self.text += data
        self.size += len(data)

    # No close() hook is defined; an earlier version printed the byte count
    # and the collected text at end of data.
# try it out

# Text that precedes the item name on a page that describes an item.
_NAME_MARKER = '<b class="q'
# Captures the text between a ">" and the next "<"; compiled once, not per page.
_TAG_TEXT = re.compile('>(.*?)<', re.DOTALL | re.IGNORECASE)


def _parse_item(page):
    """Extract the item name and its attribute strings from a page.

    Returns a ``(name, attributes)`` tuple, or ``None`` when *page* does not
    contain the item-name marker.
    """
    if _NAME_MARKER not in page:
        return None
    pieces = page.split(_NAME_MARKER)
    # Slice the three leading characters off instead of str.replace()-ing
    # them: replace() would also delete any later occurrence of the same
    # three characters inside the name itself.
    # NOTE(review): assumes exactly three characters (e.g. 3">) sit between
    # the marker and the name -- confirm against the current page markup.
    name = pieces[1][3:].split('</b>')[0]
    # Drop everything before the marker, then the name itself, so neither
    # shows up among the attributes.
    text = page.replace(pieces[0], '')
    text = text.replace(name, '')
    # Keep only the part in front of the "ge('icon" script fragment.
    text = text.split("ge('icon")[0]
    text = text.rstrip(' \t\n\r')
    attributes = [
        chunk for chunk in _TAG_TEXT.findall(text)
        if len(chunk) > 1 and '\n' not in chunk
    ]
    return name, attributes


# Created once, before the loop, so every item found ends up in the pickle
# (re-creating it per page would keep only the last page's result).
itemDB = {}
itemCounter = start
while itemCounter < finish:
    item_url = "%s" % str(url) + str(itemCounter)
    consumer = DummyConsumer()
    consumer.text = ''
    request = AsyncHTTP(item_url, consumer)
    # Runs until the channel created above has been closed.
    asyncore.loop()
    print(item_url)
    parsed = _parse_item(consumer.text)
    # Append mode keeps earlier runs; the with-block really closes the file
    # (a bare ``log.close`` without parentheses never did).
    with open('log.txt', 'a') as log:
        if parsed is not None:
            print('FOUND AN ITEM')
            name, attributes = parsed
            itemDB[name] = attributes
            heading = 'Adding %s : item %s with attributes:' % (name, itemCounter)
            print(heading)
            log.write(heading)
            for attribute in attributes:
                print(' ' + attribute)
                log.write(' ' + attribute)
            print('\n')
            log.write('\n')
        # NOTE(review): the original indentation was lost; the URL is logged
        # for every request, mirroring the console output -- confirm.
        log.write(item_url + '\n')
    # Advance only after all reporting, so messages show the ID just fetched.
    itemCounter = itemCounter + 1

with open('itemdatabase.db', 'wb') as str_path:
    pickle.dump(itemDB, str_path)
print("Complete and written to 'itemdatabase.db'!")