Scraping Anjuke (安居客) Housing Prices with a Python Crawler


  • Author: Geticsen
  • Date: 2020-01-19
Summary: using Python to scrape Shenzhen housing prices from Anjuke.

First, a look at the results of the data analysis:

1. Word cloud

[word cloud images]

2. Bar chart

[bar chart image]

3. Pie chart

[pie chart images]

4. Scatter plot and line chart

[scatter and line chart images]
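
As a rough illustration of how such a chart can be produced, the sketch below reads the scraped listings back out of MySQL and draws a bar chart of the average unit price per location. This is a minimal sketch, not the exact script used for the charts above: it assumes the data has already been saved to the anjuke table described later in this post, that pandas and matplotlib are installed, and that unit_price is stored as text such as "52341元/m²".

import pymysql
import pandas as pd
import matplotlib.pyplot as plt

# Read the scraped listings back out of MySQL (connection details are placeholders).
conn = pymysql.connect(host="localhost", user="root", password="your_password",
                       database="crawling_learning", charset="utf8")
df = pd.read_sql("SELECT house_location, unit_price FROM anjuke", conn)
conn.close()

# unit_price is stored as text; pull out the numeric part.
df["unit_price_num"] = df["unit_price"].str.extract(r"(\d+)", expand=False).astype(float)

# Average unit price for the 15 most expensive locations, drawn as a bar chart.
avg = df.groupby("house_location")["unit_price_num"].mean().nlargest(15)
avg.plot(kind="bar", figsize=(10, 5), title="Average unit price by location")
plt.tight_layout()
plt.show()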

Writing the crawler

I used the Scrapy framework; if you have never used it, I recommend going through a Scrapy tutorial first. Below are the main spider code and the database-saving code:

1. Spider code:

import scrapy


class anjuke(scrapy.Spider):
    # Spider for second-hand listings on Anjuke (安居客), Shenzhen area.
    name = "anjuke"
    city = "深圳"
    area = "shenzhenzhoubian"

    def start_requests(self):
        urls = ["https://shenzhen.anjuke.com/sale/shenzhenzhoubian/"]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Each listing sits in an <li class="list-item"> element.
        all_house = response.xpath('//li[@class="list-item"]')
        # Link to the next page of results, if there is one.
        next_url = response.xpath("//div[@class='multi-page']/a[@class='aNxt']/@href")
        for one_house in all_house:
            item_one = {
                "title": one_house.xpath('.//div[@class="house-title"]/a/text()').extract()[0],
                "house": one_house.xpath('.//div[@class="details-item"][1]/span[1]/text()').extract()[0],
                "house_big": one_house.xpath('.//div[@class="details-item"][1]/span[2]/text()').extract()[0],
                "house_build_year": one_house.xpath('.//div[@class="details-item"][1]/span[4]/text()').extract()[0],
                "house_location": one_house.xpath('.//div[@class="details-item"][2]/span/text()').extract()[0],
                "total_price": one_house.xpath('.//div[@class="pro-price"]/span/strong/text()').extract()[0] + "万",
                "unit_price": one_house.xpath('.//div[@class="pro-price"]/span[2]/text()').extract()[0],
                "city": self.city,
                "area": self.area,
            }
            yield item_one
        # Follow the pagination link until the last page.
        if next_url:
            url = next_url.extract()[0]
            yield scrapy.Request(url=url, callback=self.parse)
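
The spider can be started from the project root with scrapy crawl anjuke. If you prefer launching it from a script, a minimal sketch using Scrapy's CrawlerProcess looks like this (the project's settings are picked up automatically):

# Run the spider programmatically; "scrapy crawl anjuke" on the command line works as well.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # load the project's settings.py
process.crawl("anjuke")   # the spider's name attribute
process.start()           # blocks until the crawl finishes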

2. Database-saving code (pipelines.py):

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql
from twisted.enterprise import adbapi

class AnjukeScrapyPipeline(object):
    # ---- Asynchronous version using twisted's adbapi (kept for reference) ----
    # def __init__(self, dbpool):
    #     self.dbpool = dbpool
    #
    # def process_item(self, item, spider):
    #     """
    #     Use twisted to make the MySQL insert asynchronous: run the SQL through
    #     the connection pool and get a deferred object back.
    #     """
    #     query = self.dbpool.runInteraction(self.do_insert, item)  # specify the method to run and the data
    #     query.addErrback(self.handle_error)  # handle errors
    #
    # def do_insert(self, cursor, item):
    #     # Insert into the database; no explicit commit is needed, twisted commits automatically.
    #     insert_sql = """
    #             insert into anjuke(title,house,house_big,house_build_year,house_location,total_price,unit_price,city,area) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)
    #                  """
    #     cursor.execute(insert_sql, (item['title'], item['house'], item['house_big'], item['house_build_year'],
    #                                 item['house_location'], item['total_price'], item['unit_price'], item['city'], item['area']))
    #
    # def handle_error(self, failure):
    #     if failure:
    #         # print the error information
    #         print(failure)
    #
    # @classmethod
    # def from_settings(cls, settings):  # fixed method name, called by scrapy; gives direct access to the settings
    #     """
    #     Open the database connection pool.
    #     :param settings: the scrapy settings
    #     :return: an instance of the pipeline
    #     """
    #     adbparams = dict(
    #         host=settings['MYSQL_HOST'],
    #         db=settings['MYSQL_DBNAME'],
    #         user=settings['MYSQL_USER'],
    #         password=settings['MYSQL_PASSWORD'],
    #         cursorclass=pymysql.cursors.DictCursor  # cursor type
    #     )
    #     # Build a ConnectionPool, using pymysql (or MySQLdb) as the driver
    #     dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
    #     return cls(dbpool)

    """
        同步操作
    """

    def __init__(self):
        # 建立连接
        self.conn = pymysql.connect('localhost', 'root', '286348794zz', "crawling_learning")  # 有中文要存入数据库的话要加charset='utf8'
        # 创建游标
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # sql语句
        insert_sql = """
                      insert into 
                      anjuke(title,house,house_big,house_build_year,house_location,total_price,unit_price,city,area) 
                      VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)
                           """
        # pymysql.escape_string
        self.cursor.execute(insert_sql,(item['title'], item['house'], item['house_big'], item['house_build_year'],
                                    item['house_location'], item['total_price'], item['unit_price'], item['city'],
                                    item['area']))
        # 提交,不进行提交无法保存到数据库
        self.conn.commit()

    def close_spider(self, spider):
        # 关闭游标和连接
        self.cursor.close()
        self.conn.close()
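
The insert statement assumes that an anjuke table already exists in the crawling_learning database. The original schema is not shown, so the helper below is only a sketch: the column names match the INSERT above, while the column types and lengths are assumptions.

# One-off helper to create the table the pipeline writes to (column types are assumed).
import pymysql

conn = pymysql.connect(host="localhost", user="root", password="your_password",
                       database="crawling_learning", charset="utf8")
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS anjuke (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            house VARCHAR(64),
            house_big VARCHAR(64),
            house_build_year VARCHAR(64),
            house_location VARCHAR(255),
            total_price VARCHAR(32),
            unit_price VARCHAR(32),
            city VARCHAR(32),
            area VARCHAR(64)
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()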

3. Settings file (settings.py):

# -*- coding: utf-8 -*-

# Scrapy settings for anjuke_scrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'anjuke_scrapy'

SPIDER_MODULES = ['anjuke_scrapy.spiders']
NEWSPIDER_MODULE = 'anjuke_scrapy.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'anjuke_scrapy (+http://www.yourdomain.com)'
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'anjuke_scrapy.middlewares.AnjukeScrapySpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'anjuke_scrapy.middlewares.AnjukeScrapyDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'anjuke_scrapy.pipelines.AnjukeScrapyPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 3
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Database configuration
MYSQL_HOST = "localhost"
MYSQL_DBNAME = "your_database_name"
MYSQL_USER = "root"
MYSQL_PASSWORD = "your_password"
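
Note that the synchronous pipeline above hardcodes its connection details, so these four MYSQL_* settings are only read by the commented-out asynchronous version. If you want the synchronous pipeline to use them as well, a minimal sketch based on Scrapy's from_crawler hook could look like this (process_item and close_spider stay as shown earlier):

# Sketch: read the MYSQL_* settings instead of hardcoding them in __init__.
import pymysql


class AnjukeScrapyPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this with the running crawler, which exposes settings.py
        return cls(host=crawler.settings.get("MYSQL_HOST"),
                   db=crawler.settings.get("MYSQL_DBNAME"),
                   user=crawler.settings.get("MYSQL_USER"),
                   password=crawler.settings.get("MYSQL_PASSWORD"))

    def __init__(self, host, db, user, password):
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    database=db, charset="utf8")
        self.cursor = self.conn.cursor()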

