Question
I have the following Python code that is supposed to extract data from http://www.sherdog.com/organizations/Ultimate-Fighting-Championship-2, but the CSV files it creates are blank. Can anyone fix it so the data goes into the CSV files?
from __future__ import print_function
import datetime
import scrapy

# Scrapy spider for crawling the Sherdog fight database
fighters = {}
fights = set()
MAX_DEPTH = 20

fights_file = open('fights_%s.csv' %
                   datetime.date.today().strftime("%Y-%m-%d"), 'w')
fighter_file = open('fighter_file_%s.csv' %
                    datetime.date.today().strftime("%Y-%m-%d"), 'w')

class SherdogSpider(scrapy.Spider):
    name = 'sherdogspider'
    start_urls = ['http://www.sherdog.com/organizations/Ultimate-Fighting-Championship-2']

    def parse(self, response):
        for url in response.xpath("//a/@href").extract():
            if url.startswith("/events/UFC-"):
                url = response.urljoin(url)
                print(url)
                yield scrapy.Request(url, self.parse_event)

    def parse_event(self, response):
        fighters_l = set()
        for s in response.xpath("//tr"):
            for t in s.xpath('td[contains(@class, "text")]'):
                fighter = t.xpath("div/a/@href").extract()
                if len(fighter) == 1:
                    fighters_l.add(fighter[0])
        for f in fighters_l:
            if fighters.has_key(f):
                continue
            req = scrapy.Request(response.urljoin(f), self.parse_fighter)
            req.meta["fighter"] = f
            req.meta["dpth"] = 0
            yield req

    def parse_fighter(self, response):
        fighter = response.meta["fighter"]
        print(response.meta["dpth"], response.meta["fighter"], fighters.has_key(fighter))
        depth = response.meta["dpth"]
        if fighters.has_key(fighter):
            return
        wc = response.xpath("//h6/strong/text()").extract()
        if len(wc) == 1:
            wc = wc[0]
        else:
            wc = ""
        birthday = response.xpath('//span[contains(@itemprop, "birthDate")]/text()').extract()
        if len(birthday) == 1:
            birthday = birthday[0]
        else:
            birthday = ""
        fighters[fighter] = {"wc": wc}
        fighters[fighter] = {"birthday": birthday}
        fighter_file.write("%s %s %s " % (fighter, wc, birthday))
        for s in response.xpath("//tr"):
            res = s.xpath('td/span[contains(@class, "final")]/text()').extract()
            if res != []:
                try:
                    res = res[0]
                    tds = s.xpath("td")
                    opponent = tds[1].xpath("a/@href").extract()[0]
                    dt = tds[2].xpath("span/text()").extract()[0]
                    fight = tuple(sorted([fighter, opponent])), dt
                    if fight in fights:
                        continue
                    else:
                        fights.add(fight)
                    method = tds[3].xpath("text()").extract()[0]
                    round = tds[4].xpath("text()").extract()[0]
                    min = tds[5].xpath("text()").extract()[0]
                    data = [fighter, opponent, res, method, round, min, dt]
                    fights_file.write(" ".join([d.strip() for d in data]))
                    fights_file.write(" ")
                    if (not fighters.has_key(opponent)) and response.meta["dpth"] < MAX_DEPTH:
                        req = scrapy.Request(response.urljoin(opponent), self.parse_fighter)
                        req.meta["fighter"] = opponent
                        req.meta["dpth"] = depth + 1
                        yield req
                except Exception, e:
                    pass
Explanation / Answer
CSV (Comma-Separated Values) is the most common format for importing and exporting spreadsheets and databases. Python's csv module provides reader and writer objects for tabular data in CSV format, and its DictReader and DictWriter classes read and write rows as dictionaries.
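For instance, here is a minimal DictWriter sketch (the file name and field names are only illustrative):

import csv

out = open('example.csv', 'w')
writer = csv.DictWriter(out, fieldnames=['fighter', 'opponent', 'result'])
writer.writeheader()   # writes the header row: fighter,opponent,result
writer.writerow({'fighter': 'A', 'opponent': 'B', 'result': 'win'})
out.close()            # closing flushes the buffered rows to disk

Your spider writes space-separated strings by hand instead, which is fine, but the same flushing rule applies either way.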
When you write to a file, the data is buffered in memory first and does not reach the disk immediately. The file has to be closed (or explicitly flushed) so that any unwritten data in the buffer is written out.
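You can see the buffering with a toy example (the file name is arbitrary):

f = open('demo.txt', 'w')
f.write('hello')
# At this point demo.txt may still be empty on disk: the text is
# sitting in Python's write buffer.
f.close()  # close() flushes the buffer, so the data is now on disk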
Your script opens both output files with open() but never closes them, so everything you write is still sitting in the buffers when the process ends, and the files stay blank. We need to close them as follows:
fights_file.close()
fighter_file.close()
These two calls cannot simply go at the bottom of the module, though: Scrapy imports the spider file before it starts crawling, so the files would be closed before anything was written to them, and closing them inside parse_fighter would break every later write. The right place is the spider's closed() method, which Scrapy calls automatically when the crawl finishes. Add it to SherdogSpider and leave the rest of the script unchanged:
class SherdogSpider(scrapy.Spider):
    name = 'sherdogspider'
    start_urls = ['http://www.sherdog.com/organizations/Ultimate-Fighting-Championship-2']

    # ... parse, parse_event and parse_fighter exactly as in the question ...

    def closed(self, reason):
        # Scrapy calls this once, when the spider finishes crawling.
        # Closing the files flushes any rows still sitting in the
        # write buffers, so the CSV files are no longer blank.
        fights_file.close()
        fighter_file.close()
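With this change the buffered rows are flushed when the crawl ends and both CSV files get their data. One unrelated bug worth fixing while you are in parse_fighter: fighters[fighter] is assigned twice, so the birthday dict overwrites the weight-class dict; you probably want a single assignment:

fighters[fighter] = {"wc": wc, "birthday": birthday}

You can run the spider inside a Scrapy project with scrapy crawl sherdogspider, or standalone with scrapy runspider on the file that contains it.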