Skip to content

Commit 6b6d802

Browse files
committed
[update] crawl/crawler_toyo_3.py
1 parent 15cb8e5 commit 6b6d802

File tree

3 files changed

+257
-2
lines changed

3 files changed

+257
-2
lines changed

crawl/crawler_toyo_3.py

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@
1212
import re
1313
import pandas as pd
1414
from bs4 import BeautifulSoup
15+
import json
1516

1617
from logger_setup import *
18+
import lib_mongo_atlas
1719

1820
strabspath=os.path.abspath(sys.argv[0])
1921
strdirname=os.path.dirname(strabspath)
@@ -190,12 +192,30 @@ def filter_jobelement(in_info, opt_verbose= 'OFF'):
190192
msg = 'Start Time is {}/{}/{} {}:{}:{}'
191193
logger.info(msg.format( local_time.tm_year,local_time.tm_mon,local_time.tm_mday,\
192194
local_time.tm_hour,local_time.tm_min,local_time.tm_sec))
195+
if len(sys.argv) != 2:
196+
msg = 'Please input config json file!!! '
197+
logger.info(msg)
198+
sys.exit()
199+
200+
json_file= sys.argv[1]
201+
202+
if (not os.path.isfile(json_file)) :
203+
msg = 'Please check json file:{} if exist!!! '
204+
logger.info(msg.format(json_file) )
205+
sys.exit()
206+
207+
with open(json_file, encoding="utf-8") as f:
208+
json_data = json.load(f)
209+
210+
opt_verbose='ON'
211+
212+
db, collection= lib_mongo_atlas.mongodb_conn(json_data, opt_verbose)
193213

194214
# 爬蟲參數設定
195215
# 搜尋關鍵詞
196216
keyword = 'python '
197217
# 搜尋最大頁數
198-
maxPage = 20
218+
maxPage = 1
199219

200220
filter_params = {
201221
'area': '6001006000', # (地區) 6001001000, 台北市,新竹縣市
@@ -265,7 +285,33 @@ def filter_jobelement(in_info, opt_verbose= 'OFF'):
265285
# 輸出csv檔案
266286
#fileName = now.strftime('%Y%m%d%H%M%S') + '104人力銀行_' + keyword + '_爬蟲搜尋結果.csv'
267287
fileName = '104人力銀行_' + keyword + '_爬蟲搜尋結果'+ now.strftime('%Y%m%d')+'.csv'
268-
outputDf.to_csv(fileName, encoding='utf-8-sig')
288+
#outputDf.to_csv(fileName, encoding='utf-8-sig')
289+
"""
290+
Convert a Pandas DataFrame to a dictionary
291+
https://stackoverflow.com/questions/26716616/convert-a-pandas-dataframe-to-a-dictionary
292+
293+
>>> df = pd.DataFrame({'a': ['red', 'yellow', 'blue'], 'b': [0.5, 0.25, 0.125]})
294+
>>> df
295+
a b
296+
0 red 0.500
297+
1 yellow 0.250
298+
2 blue 0.125
299+
300+
>>> df.to_dict('records')
301+
[{'a': 'red', 'b': 0.5},
302+
{'a': 'yellow', 'b': 0.25},
303+
{'a': 'blue', 'b': 0.125}]
304+
"""
305+
306+
# Bulk-insert the scraped rows (one dict per DataFrame row) into MongoDB.
# The helper is named mongodb_insert_many in lib_mongo_atlas; the previous
# spelling (mongodb_inser_tmany) does not exist and raised AttributeError.
lib_mongo_atlas.mongodb_insert_many(db, collection, outputDf.to_dict('records'), ordered=False, opt_verbose=opt_verbose)
307+
308+
"""if opt_verbose.lower() == 'on':
309+
#msg = 'outputDf: {}'
310+
#logger.info(msg.format( outputDf))
311+
312+
msg = 'outputDf.to_dict(\'records\'): {}'
313+
logger.info(msg.format( outputDf.to_dict('records')))
314+
"""
269315

270316
msg = 'Time duration: {:.2f} seconds.'
271317
logger.info(msg.format( time.time() - t0))

crawl/lib_mongo_atlas.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
from pymongo import MongoClient
2+
import urllib.parse
3+
4+
from logger_setup import *
5+
6+
__all__ = [
7+
'mongodb_conn',
8+
'mongodb_insert_many',
9+
'mongodb_search',
10+
'mongodb_delete_many',
11+
]
12+
13+
def mongodb_conn(json_data, opt_verbose='OFF'):
    """Build a MongoDB client from a config mapping and return (db, collection).

    Args:
        json_data: Mapping with the keys "url", "username", "password",
            "db" and "collection". "url" is a str.format template that
            receives the URL-escaped username and password as its two
            positional arguments.
        opt_verbose: When 'on' (case-insensitive), the connection URL is
            logged with the password masked.

    Returns:
        A (db, coll) tuple on success, or None if creating the client or
        selecting the database/collection raised.

    Raises:
        KeyError: If "url", "username" or "password" is missing from
            json_data (these lookups happen outside the try block).
    """
    url = json_data["url"]
    username = urllib.parse.quote_plus(json_data["username"])
    password = urllib.parse.quote_plus(json_data["password"])
    conn_url = url.format(username, password)

    if opt_verbose.lower() == 'on':
        msg = '\n conn_url: {}'
        # Never write the real password to the log; substitute a mask.
        logger.info(msg.format(url.format(username, '***')))

    try:
        client = MongoClient(conn_url)
        db = client[json_data["db"]]
        coll = db[json_data["collection"]]
        # NOTE(review): MongoClient appears to connect lazily, so this message
        # may not prove the server is reachable -- confirm against pymongo docs.
        print("connected successfully!!")

        return db, coll
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate; keep the traceback.
        print("Sorry!connection failed!!")
        logger.exception('MongoDB connection setup failed')
        return None
33+
'''
34+
MongoDB insertMany and skip duplicates
35+
https://stackoverflow.com/questions/61480444/mongodb-insertmany-and-skip-duplicates
36+
'''
37+
def mongodb_insert_many(db, coll, list_collections, ordered= True, opt_verbose='OFF'):
    """Insert a batch of documents into a collection.

    Args:
        db: Unused here; kept for a signature parallel to the other helpers.
        coll: Collection-like object exposing insert_many(docs, ordered=...).
        list_collections: List of documents (dicts) to insert.
        ordered: Forwarded unchanged to coll.insert_many.
        opt_verbose: When 'on' (case-insensitive), the documents are logged
            before the insert.

    Returns:
        None.
    """
    if opt_verbose.lower() == 'on':
        msg = '\n list_collections: {}'
        logger.info(msg.format(list_collections))

    # insert_many rejects an empty document list, so an empty crawl result
    # is treated as a no-op instead of an error.
    if not list_collections:
        return

    coll.insert_many(list_collections, ordered=ordered)
44+
45+
def mongodb_search(db, coll, list_search_dicts, opt_verbose='OFF'):
    """Run equality queries and report which key/value pairs had matches.

    Every key/value pair of every dict in list_search_dicts is queried on
    its own via coll.find({key: value}).

    Args:
        db: Unused here; kept for a signature parallel to the other helpers.
        coll: Collection-like object exposing find(query).
        list_search_dicts: List of dicts whose items are the pairs to query.
        opt_verbose: When 'on' (case-insensitive), the search dicts are logged.

    Returns:
        A (list_targets, dict_targets) tuple. list_targets holds one
        {key: value} entry per matching document found (the pair, not the
        document itself); dict_targets maps each matched key to its value.
        Empty input yields ([], {}), the same shape as the non-empty case.
    """
    list_targets = []
    dict_targets = {}

    # Return the same tuple shape as the normal path so callers can always
    # unpack the result.
    if not list_search_dicts:
        return list_targets, dict_targets

    if opt_verbose.lower() == 'on':
        msg = '\n list_search_dicts: {}'
        logger.info(msg.format(list_search_dicts))

    for search_dict in list_search_dicts:
        for key, value in search_dict.items():
            # Query the JSON document based on an equality condition.
            for _ in coll.find({key: value}):
                list_targets.append({key: value})
                dict_targets[key] = value

    return list_targets, dict_targets
65+
'''
66+
Python Mongodb - Delete_many()
67+
https://acervolima.com/python-mongodb-delete_many/
68+
69+
import pymongo
70+
client = pymongo.MongoClient("mongodb://localhost:27017/")
71+
mydb = client["GFG"]
72+
col = mydb["Geeks"]
73+
74+
query = {"Name": {"$regex": "^A"}}
75+
d = col.delete_many(query)
76+
print(d.deleted_count, " documents deleted !!")
77+
'''
78+
'''
79+
How to use "hint" parameter with pymongo's delete_many()
80+
81+
https://stackoverflow.com/questions/69921904/how-to-use-hint-parameter-with-pymongos-delete-many
82+
83+
hint in DeleteMany is only supported on MongoDB 4.4 and above.
84+
85+
@app.get("/delete/{id}")
86+
async def root(id: int):
87+
db = get_database()
88+
our_filter = { 'competitionId': { '$in': [30629, 30630] } }
89+
our_hint = [('competitionId', 1)]
90+
c = db['key'].delete_many(filter = our_filter,hint = our_hint)
91+
return {"message": c.deleted_count}
92+
'''
93+
94+
def mongodb_delete_many(db, coll, dict_filter=None, list_hint=None, opt_verbose='OFF'):
    """Delete every document matching a filter and report how many were removed.

    Args:
        db: Unused here; kept for a signature parallel to the other helpers.
        coll: Collection-like object exposing delete_many(filter=...).
        dict_filter: Query filter; None is treated as {} (the former default).
            WARNING: per MongoDB semantics an empty filter matches every
            document, so calling this with no filter empties the collection.
        list_hint: Index hint; None is treated as [] (the former default).
            It is only logged -- the hint is not forwarded to delete_many.
        opt_verbose: When 'on' (case-insensitive), the filter and hint are logged.

    Returns:
        {"message": <deleted_count>} taken from the delete result.
    """
    # None sentinels replace the shared mutable defaults ({} / []).
    if dict_filter is None:
        dict_filter = {}
    if list_hint is None:
        list_hint = []

    if opt_verbose.lower() == 'on':
        msg = '\n dict_filter: {};\n list_hint: {}'
        logger.info(msg.format(dict_filter, list_hint))

    c = coll.delete_many(filter=dict_filter)
    return {"message": c.deleted_count}

crawl/requirements.txt

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
appdirs==1.4.4
2+
asttokens==2.0.8
3+
attrs==22.1.0
4+
Automat==20.2.0
5+
backcall==0.2.0
6+
beautifulsoup4==4.11.1
7+
Brotli==1.0.9
8+
bs4==0.0.1
9+
cachelib==0.9.0
10+
certifi==2022.6.15
11+
cffi==1.15.1
12+
charset-normalizer==2.1.0
13+
chart-studio==1.1.0
14+
click==8.0.1
15+
colorama==0.4.4
16+
constantly==15.1.0
17+
cryptography==37.0.4
18+
cssselect==1.1.0
19+
dash==1.20.0
20+
dash-auth==1.4.1
21+
dash-bootstrap-components==0.12.2
22+
dash-core-components==1.16.0
23+
dash-html-components==1.1.3
24+
dash-renderer==1.9.1
25+
dash-table==4.11.3
26+
decorator==5.1.1
27+
dnspython==2.2.1
28+
et-xmlfile==1.1.0
29+
executing==1.0.0
30+
fake-useragent==0.1.11
31+
feedparser==6.0.10
32+
filelock==3.8.0
33+
Flask==2.0.1
34+
Flask-Caching==2.0.1
35+
Flask-Compress==1.10.1
36+
Flask-SeaSurf==1.1.1
37+
future==0.18.2
38+
gunicorn==20.1.0
39+
hyperlink==21.0.0
40+
idna==3.3
41+
importlib-metadata==4.12.0
42+
incremental==21.3.0
43+
ipython==8.4.0
44+
itemadapter==0.7.0
45+
itemloaders==1.0.6
46+
itsdangerous==2.0.1
47+
jedi==0.18.1
48+
Jinja2==3.0.1
49+
jmespath==1.0.1
50+
lxml==4.9.1
51+
MarkupSafe==2.0.1
52+
matplotlib-inline==0.1.6
53+
multitasking==0.0.11
54+
numpy==1.20.3
55+
openpyxl==3.0.10
56+
pandas==1.2.4
57+
pandas-datareader==0.10.0
58+
parse==1.19.0
59+
parsel==1.6.0
60+
parso==0.8.3
61+
pickleshare==0.7.5
62+
plotly==4.14.3
63+
prompt-toolkit==3.0.31
64+
Protego==0.2.1
65+
pure-eval==0.2.2
66+
pyasn1==0.4.8
67+
pyasn1-modules==0.2.8
68+
pycparser==2.21
69+
PyDispatcher==2.0.6
70+
pyee==8.2.2
71+
Pygments==2.13.0
72+
pymongo==4.2.0
73+
pyOpenSSL==22.0.0
74+
pyppeteer==1.0.2
75+
pyquery==1.4.3
76+
python-dateutil==2.8.1
77+
pytz==2021.1
78+
queuelib==1.6.2
79+
requests==2.28.1
80+
requests-file==1.5.1
81+
requests-html==0.10.0
82+
retrying==1.3.3
83+
Scrapy==2.6.2
84+
service-identity==21.1.0
85+
sgmllib3k==1.0.0
86+
six==1.16.0
87+
soupsieve==2.3.2.post1
88+
stack-data==0.5.0
89+
stockstats==0.4.1
90+
TA-Lib @ file:///D:/tools/TA_Lib-0.4.24-cp38-cp38-win_amd64.whl
91+
tldextract==3.3.1
92+
tqdm==4.64.0
93+
traitlets==5.3.0
94+
Twisted==22.4.0
95+
twisted-iocpsupport==1.0.2
96+
typing_extensions==4.3.0
97+
ua-parser==0.16.1
98+
urllib3==1.26.11
99+
w3lib==1.22.0
100+
wcwidth==0.2.5
101+
websockets==10.3
102+
Werkzeug==2.0.1
103+
xlrd==2.0.1
104+
yahoo-fin==0.8.9.1
105+
yfinance==0.1.74
106+
zipp==3.8.1
107+
zope.interface==5.4.0

0 commit comments

Comments
 (0)