init
This commit is contained in:
54
spider/action_getwebdata.py
Normal file
54
spider/action_getwebdata.py
Normal file
@ -0,0 +1,54 @@
|
||||
from typing import Sequence
|
||||
import time
|
||||
import logging
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy.sql.expression import update
|
||||
|
||||
from wordspider import WordSpider
|
||||
from models import WordData
|
||||
import models
|
||||
|
||||
|
||||
AMOUNT = 9500
|
||||
|
||||
|
||||
def get_not_retrieve_word_list(session: Session) -> Sequence:
    """Select the ``WordData`` rows that have not been retrieved yet.

    Args:
        session: SQLAlchemy session the query is bound to.

    Returns:
        The query object for ``WordData`` filtered on
        ``has_retrieve=False``. It is returned as built, so the caller
        decides how to count, limit or iterate it.
    """
    return session.query(WordData).filter_by(has_retrieve=False)
|
||||
|
||||
|
||||
def get_data(session: Session, word_list: Sequence) -> None:
    """Fetch the page for every word and store the result on its row.

    For each word a ``WordSpider`` is created and ``parse_page()`` is
    called. If the spider reports success, the ``WordData`` row whose
    ``word`` column equals the word is updated with ``html`` set to the
    returned text and ``has_retrieve`` set to ``True``. If it reports
    failure, the row is left untouched.

    One progress character is written to stdout per word: ``*`` for a
    success and ``F`` for a failure.

    Args:
        session: SQLAlchemy session used to execute and commit the updates.
        word_list: The words to fetch.
    """
    for word in word_list:
        spider = WordSpider(word)
        text = spider.parse_page()

        # The markers carry no newline, so flush explicitly; otherwise they
        # sit in the stdout buffer and the progress is not visible live.
        if not spider.success:
            print("F", end="", flush=True)
            continue
        print("*", end="", flush=True)

        session.execute(
            update(WordData)
            .where(WordData.word == word)
            .values(html=text, has_retrieve=True)
        )
        # Commit per word so that pages already fetched stay stored if the
        # run is interrupted part-way through the list.
        session.commit()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: fetch up to AMOUNT not-yet-retrieved words and
    # print the elapsed wall-clock time in seconds at the end.
    starttime = time.time()
    logging.basicConfig(filename="spider.log")
    with models.Session() as session:
        pending = get_not_retrieve_word_list(session)

        # Cap the batch in the query itself. A limit returns every row when
        # fewer than AMOUNT match, so no separate COUNT query is needed.
        batch = pending.limit(AMOUNT)

        # Read the words into a plain list before any update runs.
        # get_data() executes updates and commits on this same session, and
        # a lazy iterator would read ``item.word`` in between those commits,
        # when the loaded instances may already have been expired.
        word_list = [item.word for item in batch]

        get_data(session, word_list)

    endtime = time.time()
    print(endtime - starttime)
|
||||
Reference in New Issue
Block a user