Diffstat (limited to 'NjuSpider/njubbs')
21 files changed, 0 insertions(+), 223 deletions(-)
diff --git a/NjuSpider/njubbs/njubbs/__init__.py b/NjuSpider/njubbs/njubbs/__init__.py
deleted file mode 100644
index e69de29..0000000
--- a/NjuSpider/njubbs/njubbs/__init__.py
+++ /dev/null
diff --git a/NjuSpider/njubbs/njubbs/__init__.pyc b/NjuSpider/njubbs/njubbs/__init__.pyc
deleted file mode 100644
index 1b1455b..0000000
--- a/NjuSpider/njubbs/njubbs/__init__.pyc
+++ /dev/null
Binary files differ
diff --git a/NjuSpider/njubbs/njubbs/__pycache__/__init__.cpython-34.pyc b/NjuSpider/njubbs/njubbs/__pycache__/__init__.cpython-34.pyc
deleted file mode 100644
index c403ab5..0000000
--- a/NjuSpider/njubbs/njubbs/__pycache__/__init__.cpython-34.pyc
+++ /dev/null
Binary files differ
diff --git a/NjuSpider/njubbs/njubbs/__pycache__/items.cpython-34.pyc b/NjuSpider/njubbs/njubbs/__pycache__/items.cpython-34.pyc
deleted file mode 100644
index 64081e6..0000000
--- a/NjuSpider/njubbs/njubbs/__pycache__/items.cpython-34.pyc
+++ /dev/null
Binary files differ
diff --git a/NjuSpider/njubbs/njubbs/__pycache__/settings.cpython-34.pyc b/NjuSpider/njubbs/njubbs/__pycache__/settings.cpython-34.pyc
deleted file mode 100644
index ad47ee5..0000000
--- a/NjuSpider/njubbs/njubbs/__pycache__/settings.cpython-34.pyc
+++ /dev/null
Binary files differ
diff --git a/NjuSpider/njubbs/njubbs/items.py b/NjuSpider/njubbs/njubbs/items.py
deleted file mode 100644
index 1857063..0000000
--- a/NjuSpider/njubbs/njubbs/items.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Define here the models for your scraped items
-#
-# See documentation in:
-# http://doc.scrapy.org/en/latest/topics/items.html
-
-import scrapy
-
-
-class njubbsItem(scrapy.Item):
-    title = scrapy.Field()
-    time = scrapy.Field()
-    author = scrapy.Field()
-    # define the fields for your item here like:
-    # name = scrapy.Field()
-    pass
-
-class QuoteItem(scrapy.Item):
-    text = scrapy.Field()
-    author = scrapy.Field()
diff --git a/NjuSpider/njubbs/njubbs/items.pyc b/NjuSpider/njubbs/njubbs/items.pyc
deleted file mode 100644
index 425f698..0000000
--- a/NjuSpider/njubbs/njubbs/items.pyc
+++ /dev/null
Binary files differ
diff --git a/NjuSpider/njubbs/njubbs/pipelines.py b/NjuSpider/njubbs/njubbs/pipelines.py
deleted file mode 100644
index 69a7f7a..0000000
--- a/NjuSpider/njubbs/njubbs/pipelines.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
-
-
-class NjubbsPipeline(object):
-    def process_item(self, item, spider):
-        return item
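Review note: the deleted NjubbsPipeline was the template no-op (process_item returned the item unchanged), and ITEM_PIPELINES was never enabled in settings.py, so it never ran. A minimal sketch of what a working version might have looked like; the JSON-lines output file and its name are assumptions, not part of the original project:

    import json

    class NjubbsPipeline(object):
        def open_spider(self, spider):
            # hypothetical output target; the original project never persisted items
            self.file = open('items.jl', 'w')

        def close_spider(self, spider):
            self.file.close()

        def process_item(self, item, spider):
            # write each scraped item as one JSON line, then pass it along
            self.file.write(json.dumps(dict(item)) + '\n')
            return item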
diff --git a/NjuSpider/njubbs/njubbs/settings.py b/NjuSpider/njubbs/njubbs/settings.py
deleted file mode 100644
index cdec22d..0000000
--- a/NjuSpider/njubbs/njubbs/settings.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Scrapy settings for njubbs project
-#
-# For simplicity, this file contains only settings considered important or
-# commonly used. You can find more settings consulting the documentation:
-#
-#     http://doc.scrapy.org/en/latest/topics/settings.html
-#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-
-BOT_NAME = 'njubbs'
-
-SPIDER_MODULES = ['njubbs.spiders']
-NEWSPIDER_MODULE = 'njubbs.spiders'
-
-
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'njubbs (+http://www.yourdomain.com)'
-
-# Obey robots.txt rules
-ROBOTSTXT_OBEY = False
-# ROBOTSTXT_OBEY = True
-
-# Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
-
-# Configure a delay for requests for the same website (default: 0)
-# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
-# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
-# The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
-
-# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
-
-# Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
-
-# Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
-#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-#   'Accept-Language': 'en',
-#}
-
-# Enable or disable spider middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
-#    'njubbs.middlewares.MyCustomSpiderMiddleware': 543,
-#}
-
-# Enable or disable downloader middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
-#    'njubbs.middlewares.MyCustomDownloaderMiddleware': 543,
-#}
-
-# Enable or disable extensions
-# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
-#    'scrapy.extensions.telnet.TelnetConsole': None,
-#}
-
-# Configure item pipelines
-# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'njubbs.pipelines.SomePipeline': 300,
-#}
-
-# Enable and configure the AutoThrottle extension (disabled by default)
-# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
-# The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
-# The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
-# The average number of requests Scrapy should be sending in parallel to
-# each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
-# Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
-
-# Enable and configure HTTP caching (disabled by default)
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/NjuSpider/njubbs/njubbs/settings.pyc b/NjuSpider/njubbs/njubbs/settings.pyc
deleted file mode 100644
index d3ab4e4..0000000
--- a/NjuSpider/njubbs/njubbs/settings.pyc
+++ /dev/null
Binary files differ
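Review note: apart from the generated BOT_NAME/SPIDER_MODULES boilerplate, the only active choice in settings.py was ROBOTSTXT_OBEY = False; everything else stayed commented out. Had the project wired up its pipeline, the uncommented block would have looked roughly like this (the delay value is illustrative, not from the original):

    # register the project's pipeline so process_item is actually called
    ITEM_PIPELINES = {
        'njubbs.pipelines.NjubbsPipeline': 300,
    }
    # be polite to the crawled hosts, since robots.txt is being ignored
    DOWNLOAD_DELAY = 3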
diff --git a/NjuSpider/njubbs/njubbs/spiders/2 b/NjuSpider/njubbs/njubbs/spiders/2
deleted file mode 100644
index 51833d2..0000000
--- a/NjuSpider/njubbs/njubbs/spiders/2
+++ /dev/null
@@ -1,21 +0,0 @@
-import json
-import scrapy
-from njubbs.items import njubbsItem
-
-
-class njubbsSpider(scrapy.Spider):
-    name = "njubbs"
-    start_urls = [
-        'http://bbs.nju.edu.cn/cache/t_act.js',
-    ]
-
-    def parse(self, response):
-        strRe = "\"".join(response.text.split("\'"))
-        print(strRe)
-        strRe = strRe[5:-1]
-        print(strRe)
-        # print(response.text)
-        for ncd in response.xpath('//div[@id=".p.ncd__act"]'):
-            item = njubbsItem()
-            yield ncd.xpath('td')
-
diff --git a/NjuSpider/njubbs/njubbs/spiders/__init__.py b/NjuSpider/njubbs/njubbs/spiders/__init__.py
deleted file mode 100644
index ebd689a..0000000
--- a/NjuSpider/njubbs/njubbs/spiders/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.
diff --git a/NjuSpider/njubbs/njubbs/spiders/__init__.pyc b/NjuSpider/njubbs/njubbs/spiders/__init__.pyc
deleted file mode 100644
index 5aa3307..0000000
--- a/NjuSpider/njubbs/njubbs/spiders/__init__.pyc
+++ /dev/null
Binary files differ
diff --git a/NjuSpider/njubbs/njubbs/spiders/__pycache__/__init__.cpython-34.pyc b/NjuSpider/njubbs/njubbs/spiders/__pycache__/__init__.cpython-34.pyc
deleted file mode 100644
index 63868d8..0000000
--- a/NjuSpider/njubbs/njubbs/spiders/__pycache__/__init__.cpython-34.pyc
+++ /dev/null
Binary files differ
diff --git a/NjuSpider/njubbs/njubbs/spiders/__pycache__/jwSpider.cpython-34.pyc b/NjuSpider/njubbs/njubbs/spiders/__pycache__/jwSpider.cpython-34.pyc
deleted file mode 100644
index 4aa8868..0000000
--- a/NjuSpider/njubbs/njubbs/spiders/__pycache__/jwSpider.cpython-34.pyc
+++ /dev/null
Binary files differ
diff --git a/NjuSpider/njubbs/njubbs/spiders/__pycache__/njubbsSpider.cpython-34.pyc b/NjuSpider/njubbs/njubbs/spiders/__pycache__/njubbsSpider.cpython-34.pyc
deleted file mode 100644
index 70284f9..0000000
--- a/NjuSpider/njubbs/njubbs/spiders/__pycache__/njubbsSpider.cpython-34.pyc
+++ /dev/null
Binary files differ
diff --git a/NjuSpider/njubbs/njubbs/spiders/__pycache__/quoteSpider.cpython-34.pyc b/NjuSpider/njubbs/njubbs/spiders/__pycache__/quoteSpider.cpython-34.pyc
deleted file mode 100644
index 0fd02c9..0000000
--- a/NjuSpider/njubbs/njubbs/spiders/__pycache__/quoteSpider.cpython-34.pyc
+++ /dev/null
Binary files differ
diff --git a/NjuSpider/njubbs/njubbs/spiders/jwSpider.py b/NjuSpider/njubbs/njubbs/spiders/jwSpider.py
deleted file mode 100644
index 6bf39ba..0000000
--- a/NjuSpider/njubbs/njubbs/spiders/jwSpider.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import scrapy
-from njubbs.items import njubbsItem
-
-class njubbsSpider(scrapy.Spider):
-    name = "njujw"
-    start_urls = [
-        'http://jw.nju.edu.cn/',
-    ]
-
-    def parse(self, response):
-        # print(response.text)
-        news = response.xpath('//div[@class="conbox1"]/div[@class="con1"]/ul/li')
-        print(news)
-        for li in news:
-            title = li.xpath('a/@title').extract_first()
-            print('\n')
-            url = li.xpath('a/@href').extract_first()
-            print(title)
-            print(response.urljoin(url))
-            print("\n")
-
-
-    def articleParse(self, response):
-        title = response.xpath('//div[@class="div_detail"]/div[@class="div_title"]/center').extract_first()
-
-        print(title)
-        contengt = response.xpath('//div[@class="div_detail"]/dev[@class="div_contentDetail"]')
-
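Review note: jwSpider's parse only printed titles and joined URLs; articleParse was defined but never scheduled as a callback, so article pages were never fetched (its last line also misspells "content" and writes the XPath element as "dev" instead of "div"). A sketch of the missing request chaining, assuming the printed links were meant to be crawled:

    def parse(self, response):
        news = response.xpath('//div[@class="conbox1"]/div[@class="con1"]/ul/li')
        for li in news:
            url = li.xpath('a/@href').extract_first()
            if url:
                # schedule articleParse as the callback instead of just printing the URL
                yield scrapy.Request(response.urljoin(url), callback=self.articleParse)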
diff --git a/NjuSpider/njubbs/njubbs/spiders/njubbsSpider.py b/NjuSpider/njubbs/njubbs/spiders/njubbsSpider.py
deleted file mode 100644
index de761cd..0000000
--- a/NjuSpider/njubbs/njubbs/spiders/njubbsSpider.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import json
-import scrapy
-from njubbs.items import njubbsItem
-
-
-class njubbsSpider(scrapy.Spider):
-    name = "njubbs"
-    start_urls = [
-        'http://bbs.nju.edu.cn/cache/t_act.js',
-    ]
-
-    def parse(self, response):
-        strRe = "\"".join(response.text.split("\'"))
-        print(strRe)
-        strRe = strRe[5:-24]
-        jsonStr = json.JSONEncoder().encode(strRe)
-        print(strRe)
-        # print(response.text)
-        for ncd in response.xpath('//div[@id=".p.ncd__act"]'):
-            item = njubbsItem()
-            yield ncd.xpath('td')
-
diff --git a/NjuSpider/njubbs/njubbs/spiders/quoteSpider.py b/NjuSpider/njubbs/njubbs/spiders/quoteSpider.py
deleted file mode 100644
index 66f86ea..0000000
--- a/NjuSpider/njubbs/njubbs/spiders/quoteSpider.py
+++ /dev/null
@@ -1,14 +0,0 @@
-import scrapy
-from njubbs.items import QuoteItem
-class QuotesSpider(scrapy.Spider):
-    name = "quotes"
-    start_urls = [
-        'http://quotes.toscrape.com/page/1/',
-        'http://quotes.toscrape.com/page/2/',
-    ]
-    def parse(self, response):
-        for quote in response.xpath('//div[@class="quote"]'):
-            item = QuoteItem()
-            item['text'] = quote.xpath('span[@class="text"]/text()').extract_first()
-            item['author'] = quote.xpath('span/small/text()').extract_first()
-            yield item
diff --git a/NjuSpider/njubbs/scrapy.cfg b/NjuSpider/njubbs/scrapy.cfg
deleted file mode 100644
index 0aba25d..0000000
--- a/NjuSpider/njubbs/scrapy.cfg
+++ /dev/null
@@ -1,11 +0,0 @@
-# Automatically created by: scrapy startproject
-#
-# For more information about the [deploy] section see:
-# https://scrapyd.readthedocs.org/en/latest/deploy.html
-
-[settings]
-default = njubbs.settings
-
-[deploy]
-#url = http://localhost:6800/
-project = njubbs
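Review note on the two njubbs spiders above: both tried to turn the JavaScript feed at t_act.js into data by swapping quote characters and slicing magic offsets ([5:-1] and [5:-24]), then ran an HTML XPath over a JS response, so the for loop could never match; json.JSONEncoder().encode also re-encodes a string rather than parsing it. A sketch of the likely intent for the parse method, where the bracket-based wrapper stripping and the field mapping are assumptions about the feed's format:

    def parse(self, response):
        text = response.text.replace("'", '"')
        # strip the JS assignment wrapper around the payload; the exact
        # prefix/suffix depends on how t_act.js wraps its data (assumption)
        start = text.find('[')
        end = text.rfind(']') + 1
        data = json.loads(text[start:end])  # parse the payload, don't re-encode it
        for entry in data:
            item = njubbsItem()
            item['title'] = entry  # illustrative mapping; real entries may be nested
            yield item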
