Files
osdict_project/spider/action_getwebdata.py

55 lines
1.3 KiB
Python
Raw Normal View History

2022-10-22 11:01:52 +08:00
from typing import Sequence
import time
import logging
from sqlalchemy.orm import Session
from sqlalchemy.sql.expression import update
from wordspider import WordSpider
from models import WordData
import models
# Maximum number of not-yet-retrieved words processed in a single run of
# this script (used to cap the query result in the __main__ block).
AMOUNT = 9500
def get_not_retrieve_word_list(session: Session) -> Sequence:
    """Build the query for ``WordData`` rows that have not been retrieved yet.

    The query is returned without being iterated, so callers can still
    count, slice or loop over it.

    Args:
        session: Session the query is bound to.

    Returns:
        The ``WordData`` query filtered by ``has_retrieve=False``.
    """
    return session.query(WordData).filter_by(has_retrieve=False)
def get_data(session: Session, word_list: Sequence) -> None:
    """Fetch the page for each word and store it on the matching ``WordData`` row.

    For every word a ``WordSpider`` is created and ``parse_page()`` is called.
    A ``*`` is printed for a successful fetch and an ``F`` for a failed one.
    On success an UPDATE sets ``html`` to the parsed text and
    ``has_retrieve`` to True for the row whose ``word`` matches; failed
    words are skipped without touching the database.

    Args:
        session: Session used to execute and commit the UPDATE statements.
        word_list: The words to fetch.
    """
    for word in word_list:
        spider = WordSpider(word)
        text = spider.parse_page()

        if not spider.success:
            # flush=True: the markers carry no newline, so without an explicit
            # flush they can sit in the stdout buffer instead of showing
            # progress while the run is still going.
            print("F", end="", flush=True)
            continue

        print("*", end="", flush=True)
        session.execute(
            update(WordData)
            .where(WordData.word == word)
            .values(html=text, has_retrieve=True)
        )
        # Commit after every word so results fetched so far are kept even if
        # a later word aborts the run.
        session.commit()
if __name__ == "__main__":
    # Script entry point: process up to AMOUNT pending words and report the
    # elapsed time in seconds.
    # perf_counter() is monotonic, so the measurement is not affected by
    # system clock adjustments during a long run.
    started = time.perf_counter()
    logging.basicConfig(filename="spider.log")
    with models.Session() as session:
        pending = get_not_retrieve_word_list(session)
        # Slicing returns at most AMOUNT rows (fewer when fewer exist), so a
        # separate count() query to pick between "all" and "first AMOUNT" is
        # not needed.
        rows = pending[0:AMOUNT]
        # Build the list eagerly: a lazy map() would read row.word while
        # get_data is already committing on the same session, and get_data
        # declares its argument as a Sequence.
        word_list = [row.word for row in rows]
        get_data(session, word_list)
    print(time.perf_counter() - started)