
Question

I have the following Python code that is supposed to extract data from http://www.sherdog.com/organizations/Ultimate-Fighting-Championship-2, but the CSV files it creates are blank. Can anyone fix it so the data goes into the CSV files?

from __future__ import print_function
import datetime
import scrapy

#Scrapy spider for crawling Sherdog fight database

fighters = {}
fights = set()

MAX_DEPTH = 20

fights_file = open('fights_%s.csv' %
                   datetime.date.today().strftime("%Y-%m-%d"), 'w')
fighter_file = open('fighter_file_%s.csv' %
                    datetime.date.today().strftime("%Y-%m-%d"), 'w')
                  
class SherdogSpider(scrapy.Spider):
    name = 'sherdogspider'
    start_urls = ['http://www.sherdog.com/organizations/Ultimate-Fighting-Championship-2']

    def parse(self, response):
        for url in response.xpath("//a/@href").extract():
            if url.startswith("/events/UFC-"):
                url = response.urljoin(url)
                print(url)
                yield scrapy.Request(url, self.parse_event)

    def parse_event(self, response):
        fighters_l = set()
        for s in response.xpath("//tr"):
            for t in s.xpath('td[contains(@class, "text")]'):
                fighter = t.xpath("div/a/@href").extract()
                if len(fighter) == 1:
                    fighters_l.add(fighter[0])
        for f in fighters_l:
            if fighters.has_key(f):
                continue
            req = scrapy.Request(response.urljoin(f), self.parse_fighter)
            req.meta["fighter"] = f
            req.meta["dpth"] = 0
            yield req
          

    def parse_fighter(self, response):     
        fighter = response.meta["fighter"]
        print (response.meta["dpth"], response.meta["fighter"], fighters.has_key(fighter))
        depth = response.meta["dpth"]
        if fighters.has_key(fighter):
            return
        wc = response.xpath("//h6/strong/text()").extract()
        if len(wc) == 1:
            wc = wc[0]
        else:
            wc = ""
        birthday = response.xpath('//span[contains (@itemprop,"birthDate")]/text()').extract()
        if len(birthday) == 1:
            birthday = birthday[0]
        else:
            birthday = ""
          
        fighters[fighter] = {"wc" : wc}
        fighters[fighter] = {"birthday" : birthday}
        fighter_file.write("%s %s %s " % (fighter, wc, birthday))
      
        for s in response.xpath("//tr"):
            res = s.xpath('td/span[contains(@class, "final")]/text()').extract()
            if res != []:
                try:
                    res = res[0]
                    tds = s.xpath("td")
                    opponent = tds[1].xpath("a/@href").extract()[0]
                    dt = tds[2].xpath("span/text()").extract()[0]
                    fight = tuple(sorted([fighter, opponent])), dt
                    if fight in fights:
                        continue
                    else:
                        fights.add(fight)
                    method = tds[3].xpath("text()").extract()[0]
                    round = tds[4].xpath("text()").extract()[0]
                    min = tds[5].xpath("text()").extract()[0]
                    data = [fighter, opponent, res, method, round, min, dt]
                    fights_file.write(" ".join([d.strip() for d in data]))
                    fights_file.write(" ")
                  
                    if (not fighters.has_key(opponent)) and response.meta["dpth"] < MAX_DEPTH:
                        req = scrapy.Request(response.urljoin(opponent), self.parse_fighter)
                        req.meta["fighter"] = opponent
                        req.meta["dpth"] = depth + 1
                        yield req                  
                except Exception, e:
                    pass

Explanation / Answer

CSV (Comma-Separated Values) is the most common format used for importing and exporting spreadsheets and databases.

Python's csv module implements classes to read and write tabular data in CSV format.
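
For instance, here is a minimal sketch of writing rows with csv.writer on Python 2 (which this script targets); the file name and rows are made up for illustration, and on Python 3 you would open the file with open('example.csv', 'w', newline='') instead:

import csv

# Made-up example rows, just for illustration
rows = [
    ["fighter", "opponent", "result"],
    ["/fighter/A-1", "/fighter/B-2", "win"],
]

f = open('example.csv', 'wb')  # 'wb' is what the Python 2 csv docs recommend
writer = csv.writer(f)
writer.writerows(rows)         # each inner list becomes one CSV row
f.close()                      # closing flushes the buffered rows to disk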

We can also read and write data in dictionary form using the DictReader and DictWriter classes.
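
A sketch with DictWriter, again with a hypothetical file name and made-up field names and values:

import csv

f = open('fighters_example.csv', 'wb')
writer = csv.DictWriter(f, fieldnames=["fighter", "wc", "birthday"])
writer.writeheader()  # writes the header row from fieldnames
writer.writerow({"fighter": "/fighter/A-1",
                 "wc": "Lightweight",
                 "birthday": "1990-01-01"})
f.close()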

When we write data to a file, it is buffered in memory first and does not reach the disk immediately. The file must therefore be closed (or flushed) so that any unwritten data is written out.
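
You can see the effect with a throwaway file (the name is made up): until close() or flush() is called, the row may exist only in the buffer and the file on disk can still be empty.

f = open('demo.csv', 'w')
f.write("a,b,c\n")  # goes into an in-memory buffer first
# At this point demo.csv on disk may still be empty.
f.close()           # flushes the buffer; the row is now on disk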

Every file opened with open() for writing must be closed so that its buffer is flushed. Since you open both files with open() and never close them, the buffered rows are never written out, which is why the CSV files come out blank. We need to close both of them:

fights_file.close()

fighter_file.close()

We cannot simply add these at the end of the script, because Scrapy executes the module-level code before the crawl starts; instead we put them in the spider's closed() method, which Scrapy calls once when the spider finishes. The complete script:

from __future__ import print_function
import datetime
import scrapy

# Scrapy spider for crawling the Sherdog fight database

fighters = {}
fights = set()

MAX_DEPTH = 20

fights_file = open('fights_%s.csv' %
                   datetime.date.today().strftime("%Y-%m-%d"), 'w')
fighter_file = open('fighter_file_%s.csv' %
                    datetime.date.today().strftime("%Y-%m-%d"), 'w')


class SherdogSpider(scrapy.Spider):
    name = 'sherdogspider'
    start_urls = ['http://www.sherdog.com/organizations/Ultimate-Fighting-Championship-2']

    def parse(self, response):
        # Follow every UFC event link on the organization page
        for url in response.xpath("//a/@href").extract():
            if url.startswith("/events/UFC-"):
                url = response.urljoin(url)
                print(url)
                yield scrapy.Request(url, self.parse_event)

    def parse_event(self, response):
        # Collect the fighter profile links from the event card
        fighters_l = set()
        for s in response.xpath("//tr"):
            for t in s.xpath('td[contains(@class, "text")]'):
                fighter = t.xpath("div/a/@href").extract()
                if len(fighter) == 1:
                    fighters_l.add(fighter[0])
        for f in fighters_l:
            if fighters.has_key(f):
                continue
            req = scrapy.Request(response.urljoin(f), self.parse_fighter)
            req.meta["fighter"] = f
            req.meta["dpth"] = 0
            yield req

    def parse_fighter(self, response):
        fighter = response.meta["fighter"]
        print(response.meta["dpth"], response.meta["fighter"], fighters.has_key(fighter))
        depth = response.meta["dpth"]
        if fighters.has_key(fighter):
            return
        wc = response.xpath("//h6/strong/text()").extract()
        if len(wc) == 1:
            wc = wc[0]
        else:
            wc = ""
        birthday = response.xpath('//span[contains (@itemprop,"birthDate")]/text()').extract()
        if len(birthday) == 1:
            birthday = birthday[0]
        else:
            birthday = ""

        # Store both fields in one dict; two separate assignments
        # would overwrite each other
        fighters[fighter] = {"wc": wc, "birthday": birthday}
        fighter_file.write("%s %s %s " % (fighter, wc, birthday))

        for s in response.xpath("//tr"):
            res = s.xpath('td/span[contains(@class, "final")]/text()').extract()
            if res != []:
                try:
                    res = res[0]
                    tds = s.xpath("td")
                    opponent = tds[1].xpath("a/@href").extract()[0]
                    dt = tds[2].xpath("span/text()").extract()[0]
                    fight = tuple(sorted([fighter, opponent])), dt
                    if fight in fights:
                        continue
                    else:
                        fights.add(fight)
                    method = tds[3].xpath("text()").extract()[0]
                    round = tds[4].xpath("text()").extract()[0]
                    min = tds[5].xpath("text()").extract()[0]
                    data = [fighter, opponent, res, method, round, min, dt]
                    fights_file.write(" ".join([d.strip() for d in data]))
                    fights_file.write(" ")

                    if (not fighters.has_key(opponent)) and response.meta["dpth"] < MAX_DEPTH:
                        req = scrapy.Request(response.urljoin(opponent), self.parse_fighter)
                        req.meta["fighter"] = opponent
                        req.meta["dpth"] = depth + 1
                        yield req
                except Exception, e:
                    pass

    def closed(self, reason):
        # Scrapy calls this once, after the crawl finishes.
        # Closing the files flushes the buffered rows to disk.
        fights_file.close()
        fighter_file.close()
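
If you save the script as, say, sherdog_spider.py (the name is just an example), you can run it with Scrapy's runspider command and then inspect the dated CSV files it creates:

scrapy runspider sherdog_spider.py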