Update October 16, 2019: the code now uses two selector sets :)

In this tutorial I will share how to scrape a WordPress sitemap using the Python programming language, with the BeautifulSoup library and a few supporting libraries.
The original idea for this scraping tool came when PHP felt painfully slow for the job. I tried Python and it was still slow :v, though most likely the server of the site I was scraping was simply slow.
Let's jump straight into the experiment:
- First, make sure you have Python installed
- Next, install the required libraries (a quick way to verify the installs is sketched right after this list):
- pip install beautifulsoup4
- pip install requests
- pip install htmlmin
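To confirm the three libraries installed correctly, a tiny check like the one below should run without errors. This snippet is my own sanity check, not part of the scraper:

# all three imports should succeed after the pip installs above
import requests
import htmlmin
import bs4

print("requests", requests.__version__)
print("beautifulsoup4", bs4.__version__)
print("htmlmin imported OK")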
- Then create a file with the .py extension and fill it with the Python code below:
import requests
import htmlmin
import html
from bs4 import BeautifulSoup

# sitemap to scrape
sitemap_post_url = 'https://apkhome.net/post-sitemap64.xml'

# name the output file after the sitemap's file name
sitemap_post_url_lastpath = sitemap_post_url.split("/")[-1].replace(".xml", "")
xml_name = sitemap_post_url_lastpath + ".xml"

# fallback selector set (2nd)
selector2_title = 'div.short-detail > a > h2'
selector2_date = 'meta'
selector2_date_property = 'article:published_time'
selector2_category = 'div.short-detail > p > a'
selector2_content = 'div.post-content > div.post-main-content'

# primary selector set
selector_title = 'div.details > div.p10 > dl > dd > div.p1 > h2'
selector_date = 'meta'
selector_date_property = 'article:published_time'
selector_category = 'div.details-title > a:nth-child(3)'
selector_content = 'div.my-container > div.details'

# helper functions ==============================================================

def get_title(result):
    # escape the raw title text so it is safe inside the xml
    post_title = result.string
    post_title = html.escape(post_title)
    return post_title

def get_category(result):
    # build blogger <category> tags and drop duplicates
    post_category_arr = []
    for caq in result:
        category_name = "<category scheme='http://www.blogger.com/atom/ns#' term='" + \
            html.escape(caq.get_text()) + "' />"
        post_category_arr.append(category_name)
    post_category_remove_same_value = list(set(post_category_arr))
    post_category = ''.join(post_category_remove_same_value)
    return post_category

def get_content(result, selector):
    if selector == 1:
        # extra cleanup that only applies when the 2nd selector set is in use
        for div in result.find_all("div", {'class': 'p10'}):
            div.decompose()
        for div in result.find_all("div", {'class': 'below-com-widget'}):
            div.decompose()
        for div in result.find_all("div", {'class': 'additional'}):
            div.decompose()
    # strip scripts, ads and other non-content tags
    for x in result.find_all(['script', 'noscript', 'ins', 'style', 'link', 'meta']):
        x.extract()
    # swap lazy-loaded image attributes for real src attributes
    for img in result.find_all('img', {'data-src': True}):
        img['src'] = img['data-src']
        del img['data-src']
        del img['data-lazyloaded']
    # turn the tag object into a string, then minify and escape it
    result = str(result)
    result = htmlmin.minify(result, remove_empty_space=True,
                            remove_optional_attribute_quotes=False)
    result = html.escape(result)
    return result

# start scraping ================================================================

# request the sitemap and parse it
req_post = requests.get(sitemap_post_url)
req_post_text = req_post.text
soup_sitemap_post = BeautifulSoup(req_post_text, "html.parser")

# collect every <url> entry in the sitemap
sitemap_post_tags = soup_sitemap_post.find_all("url")
print("Sitemap post count: {0}".format(len(sitemap_post_tags)))

# truncate the output file
open(xml_name, 'w').close()

# write the atom feed header
with open(xml_name, "a", encoding="utf-8") as f:
    f.write("<?xml version='1.0' encoding='UTF-8'?><feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/' xmlns:gd='http://schemas.google.com/g/2005' xmlns:thr='http://purl.org/syndication/thread/1.0' xmlns:georss='http://www.georss.org/georss'><generator version='7.00' uri='https://www.blogger.com'>Blogger</generator>")

# running post id
post_number = 1
# 0 = primary selector set, 1 = 2nd selector set
use_selector2 = 0

for sitemap_post in sitemap_post_tags:
    # get the post url from the sitemap entry
    post_url = sitemap_post.find_next("loc").text

    # request the post page and parse it
    req_detail = requests.get(post_url)
    soup_detail = BeautifulSoup(req_detail.content, "html.parser")

    if use_selector2 == 1:
        # 2nd selector set ======================================================
        post_title = soup_detail.select_one(selector2_title)
        post_date = soup_detail.find(selector2_date, property=selector2_date_property)
        post_category_select = soup_detail.select(selector2_category)
        post_content_select = soup_detail.select_one(selector2_content)
    else:
        # primary selector set; each lookup falls back to the 2nd set ==========
        post_title = soup_detail.select_one(selector_title)
        if post_title is None:
            print("Title selector not found, trying the 2nd selector set")
            post_title = soup_detail.select_one(selector2_title)
            if post_title is None:
                print("Title selector 2 not found, check the code")
                break
            # the 2nd set matched, so use it for all following posts
            use_selector2 = 1

        post_date = soup_detail.find(selector_date, property=selector_date_property)
        if post_date is None:
            print("Date selector not found, trying the 2nd selector set")
            post_date = soup_detail.find(selector2_date, property=selector2_date_property)
            if post_date is None:
                print("Date selector 2 not found, check the code")
                break

        post_category_select = soup_detail.select(selector_category)
        if len(post_category_select) == 0:
            print("Category selector not found, trying the 2nd selector set")
            post_category_select = soup_detail.select(selector2_category)
            if len(post_category_select) == 0:
                print("Category selector 2 not found, check the code")
                break

        post_content_select = soup_detail.select_one(selector_content)
        if post_content_select is None:
            print("Content selector not found, trying the 2nd selector set")
            post_content_select = soup_detail.select_one(selector2_content)
            if post_content_select is None:
                print("Content selector 2 not found, check the code")
                break

    # turn the raw tags into the strings we will write out
    post_title = get_title(post_title)
    post_date = str(post_date['content'])
    post_category = get_category(post_category_select)
    post_content = get_content(post_content_select, use_selector2)

    # append the entry to the atom feed
    with open(xml_name, "a", encoding="utf-8") as f:
        f.write("<entry><id>post-" + str(post_number) + "</id><published>" + post_date +
                "</published><updated>" + post_date +
                "</updated><category scheme='http://schemas.google.com/g/2005#kind' term='http://schemas.google.com/blogger/2008/kind#post' />" +
                post_category + "<title type='text'>" + post_title +
                "</title><content type='html'>" + post_content +
                "</content><author><name>Scraper</name></author></entry>")

    # log progress so we can see the script is still running
    print("%d__%s" % (post_number, post_url))
    post_number += 1

# close the feed
with open(xml_name, "a", encoding="utf-8") as f:
    f.write("</feed>")
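Tip: before letting the script chew through the whole sitemap, it can be worth a dry run on just a few entries. One way (my suggestion, not in the original code) is to temporarily slice the list of url tags in the for line:

# temporary dry run: only visit the first 5 posts from the sitemap
for sitemap_post in sitemap_post_tags[:5]:
    post_url = sitemap_post.find_next("loc").text
    print(post_url)  # eyeball the urls before doing the full scrape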
- Next, save the file and open it with IDLE, the text editor bundled with Python, by right-clicking the Python file and choosing Edit with IDLE, as shown in the image below:
- To run the program, click the Run menu and then Run Module, or simply press F5 on your keyboard
- If it runs successfully, you will see something like this:
- You will find the scrape result in the same folder as the Python file you ran
This XML file can be imported directly into Blogger.
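Before importing, you can also check that the generated file is well-formed XML using only the standard library. A minimal sketch, assuming the output file is named post-sitemap64.xml as in the code above:

import xml.etree.ElementTree as ET

# a parse error here means blogger will likely reject the import as well
try:
    ET.parse("post-sitemap64.xml")
    print("XML is well-formed")
except ET.ParseError as err:
    print("XML is broken:", err)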
I may explain the code in detail some other time; until then, feel free to tinker with it yourself.
If the code stops working, the target site has most likely updated its template, so the selectors no longer match anything. The fix is to update the selectors; a quick way to test a candidate selector is sketched below.
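Updating a selector usually means opening a post in the browser, inspecting the element, copying its CSS path, and trying it against the live page. A minimal sketch, where the post URL and the selector string are placeholders you would replace with your own:

import requests
from bs4 import BeautifulSoup

# placeholder post url: use a real article from the target site
soup = BeautifulSoup(requests.get("https://apkhome.net/example-post/").content, "html.parser")

# placeholder selector: paste the css path copied from the browser inspector
print(soup.select_one("div.short-detail > a > h2"))  # None means the selector does not match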
Comments

I get "Title selector 2 not found, check the code", how do I fix it?