This commit is contained in:
2022-10-22 11:01:52 +08:00
commit 200117b921
105 changed files with 26944 additions and 0 deletions

View File

@ -0,0 +1,54 @@
from typing import Sequence
import time
import logging
from sqlalchemy.orm import Session
from sqlalchemy.sql.expression import update
from wordspider import WordSpider
from models import WordData
import models
# Upper bound on the number of words handled in a single run of this script.
AMOUNT = 9500
def get_not_retrieve_word_list(session: Session) -> Sequence:
    """Build the query selecting every ``WordData`` row not yet retrieved.

    Args:
        session: SQLAlchemy session the query is issued through.

    Returns:
        The object produced by
        ``session.query(WordData).filter_by(has_retrieve=False)``; the query
        itself is handed back, not a materialised list of rows.
    """
    return session.query(WordData).filter_by(has_retrieve=False)
def get_data(session: Session, word_list: Sequence) -> None:
    """Fetch the page for each word and store it on the matching row.

    For every word a ``WordSpider`` is created and ``parse_page()`` is
    called.  A ``*`` is printed for a success and an ``F`` for a failure,
    giving a one-character-per-word progress trace on stdout.  Failed words
    are skipped and additionally reported through ``logging`` so they can be
    identified afterwards.

    Args:
        session: SQLAlchemy session used to issue and commit the updates.
        word_list: Iterable of words to fetch.

    Returns:
        None.
    """
    for word in word_list:
        spider = WordSpider(word)
        text = spider.parse_page()
        if not spider.success:
            print("F", end="")
            # The bare "F" does not say which word failed; record it so the
            # failure can be investigated or retried later.
            logging.warning("failed to retrieve word: %s", word)
            continue
        print("*", end="")
        session.execute(
            update(WordData)
            .where(WordData.word == word)
            .values(html=text, has_retrieve=True)
        )
        # Commit per word so finished work is kept if a later word raises.
        session.commit()
if __name__ == "__main__":
    # Script entry point: fetch up to AMOUNT not-yet-retrieved words and
    # print the elapsed wall-clock time in seconds.
    starttime = time.time()
    logging.basicConfig(filename="spider.log")
    with models.Session() as session:
        pending = get_not_retrieve_word_list(session)
        # Slicing the query already caps the batch at AMOUNT rows and simply
        # yields fewer when fewer exist, so no separate count() is needed.
        batch = pending[0:AMOUNT]
        # Read the words eagerly, before get_data() starts committing on this
        # session: with SQLAlchemy's default expire_on_commit, a lazy read of
        # item.word after a commit would reload each row individually.
        word_list = [item.word for item in batch]
        get_data(session, word_list)
    endtime = time.time()
    print(endtime - starttime)