init
This commit is contained in:
56
spider/wordspider.py
Normal file
56
spider/wordspider.py
Normal file
@@ -0,0 +1,56 @@
|
||||
import logging
|
||||
from fake_useragent import UserAgent
|
||||
import httpx
|
||||
from parsel import Selector
|
||||
from htmlmin import minify
|
||||
|
||||
|
||||
def get_content(text: str) -> str:
    """Return the ``.results-content`` part of a page with clutter removed.

    The markup in *text* is parsed, narrowed to the elements matching
    ``.results-content``, and every element inside that selection matching
    one of the unwanted selectors below is removed before serialising.

    Args:
        text: HTML source of the page.

    Returns:
        Whatever ``.get()`` yields for the cleaned selection.
        NOTE(review): parsel presumably returns ``None`` here when nothing
        matches ``.results-content`` -- confirm before relying on ``str``.
    """
    unwanted = (
        "#examples",
        "#webTrans",
        "script",
        "style",
        "a",
        ".img-list",
    )
    content = Selector(text).css(".results-content")
    # One query per selector, in the listed order, each followed by removal.
    for css_query in unwanted:
        content.css(css_query).remove()
    return content.get()
|
||||
|
||||
|
||||
class WordSpider:
    """Fetch the youdao.com page for a single word and extract its content.

    The page is downloaded on the first access to :attr:`html` and the
    minified markup is cached on the instance afterwards.
    """

    def __init__(self, word: str) -> None:
        """Record *word* and build the request URL and headers for it."""
        self.useragent = UserAgent()
        # One random User-Agent string is picked per spider instance.
        self.headers = {"User-Agent": self.useragent.random}
        self.data = []
        self.word = word
        self.__html = ''
        self.url = "http://www.youdao.com/w/eng/{}/".format(word)
        # None until a download has been attempted, then True or False.
        self.success = None

    def get_html(self) -> str:
        """Download the page, cache its minified HTML and return it.

        On a successful response ``success`` becomes ``True`` and the cache
        is replaced. If the server answers with an error status, the error
        is logged, ``success`` becomes ``False`` and the cached HTML (an
        empty string when nothing was fetched before) is returned unchanged.

        Returns:
            The cached, minified HTML.
        """
        try:
            response = httpx.get(self.url, headers=self.headers)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            # Lazy %-formatting: concatenating a str with the exception
            # object would itself raise TypeError inside this handler.
            logging.error("fail. %s", e)
            self.success = False
        else:
            # Only a good response may overwrite the cache and mark success;
            # an error page must not be stored as the word's content.
            # NOTE(review): the two positional flags are presumably htmlmin's
            # remove_comments and remove_empty_space -- confirm.
            self.__html = minify(response.text, True, True)
            self.success = True
        return self.__html

    @property
    def html(self) -> str:
        """Cached page HTML; triggers a download while the cache is empty."""
        if self.__html == "":
            self.get_html()
        return self.__html

    def parse_page(self) -> str:
        """Return the result of ``get_content`` applied to :attr:`html`."""
        return get_content(self.html)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke run: fetch the page for the word "a" and print what
    # parse_page() returns for it.
    spider = WordSpider("a")
    print(spider.parse_page())
|
||||
Reference in New Issue
Block a user