티스토리 뷰

Python/Note

selenium 안드로이드 앱 크롤링

j0n9m1n1 j0n9m1n1 2019. 2. 13. 16:50
'''
pip install selenium
pip install requests
Requires chromedriver.

No size limit on downloads.
Apps need to be sorted by category:
collect the URLs of the apps, then
for each entry in the list, read the category shown at that URL;
if the category folder does not exist, create it and download into it,
if it already exists, download into it.
In theory 25 pages x 20 apps = 500 downloads.
'''
import os, sys, time
import requests
from selenium import webdriver
 
# Crawl settings.
# Number of listing pages to walk through (requested as ?f_page=1..page).
page = 25
# Number of app entries read from each listing page.
in_page = 20
# Seconds to sleep after clicking a download link.
sleep_time = 30
# Path handed to webdriver.Chrome as the chromedriver executable.
chromePath = 'C:/Users/hacke/Desktop/for_reaLife/chromedriver'
# Base directory; the category name is appended to form Chrome's download
# directory.
download_base = 'C:/Users/hacke/Desktop/for_reaLife/crawling_app/'

# Options object that later receives the per-category download preference.
chromeOptions = webdriver.ChromeOptions()
# Browser session used for collecting the links (started without the options).
driver = webdriver.Chrome(chromePath)
 
# Collect the download-page URL of every app on the "recently updated" listing.
# list_links[p][k] holds the URL of the (k + 1)-th app on listing page p + 1;
# every slot starts out as the "" placeholder.
list_links = [[""] * in_page for i in range(page)]
# Disabled placeholders for per-app category / file-name bookkeeping.
# list_category = [[""] * in_page for i in range(page)]
# list_filename = [[""] * in_page for i in range(page)]

for i in range(1, page + 1):
    driver.get('https://apk.support/apps-updated/?f_page=' + str(i))
    # The XPath uses div[5] on the first listing page and div[6] on the others.
    # Compare with ==, not `is`: identity on int literals only works by accident
    # of CPython's small-int caching and raises a SyntaxWarning on Python 3.8+.
    container = 'div[5]' if i == 1 else 'div[6]'
    for j in range(1, in_page + 1):
        attr = driver.find_element_by_xpath(
            '/html/body/' + container + '/div[2]/ul/li[' + str(j) + ']/dl/a')
        href = attr.get_attribute('href')
        # Only the first occurrence of "app" in the link is rewritten to
        # "download-app"; the result is stored as the app's download page.
        list_links[i - 1][j - 1] = href.replace("app", "download-app", 1)
 
# driver.quit()
 
# Visit every collected download page, make sure a folder named after the
# app's category exists under download_base, and trigger the download into it.

# Candidate locations of the download link, tried in this order.
download_xpaths = (
    '/html/body/div[5]/div[1]/div[1]/a',
    '/html/body/div[5]/div[4]/a',
    '/html/body/div[5]/div[2]/a',
)

for i in range(page):
    for j in range(in_page):
        driver.get(list_links[i][j])
        category = str(driver.find_element_by_xpath('/html/body/div[4]/div[1]/a[3]').text)
        # list_category[i][j] = category

        # Create the folder in the same place Chrome is told to download to.
        # The previous relative 'crawling_app/' path only matched download_base
        # when the script was started from one specific working directory.
        # exist_ok makes the separate "does it exist yet?" listing unnecessary.
        os.makedirs(download_base + category, exist_ok=True)

        # The download directory is supplied through the options passed to
        # webdriver.Chrome, so the browser is relaunched for every app in order
        # to switch to that app's category folder.
        driver.quit()
        prefs = {"download.default_directory": download_base + category}
        chromeOptions.add_experimental_option("prefs", prefs)
        driver = webdriver.Chrome(chromePath, 0, chrome_options=chromeOptions)
        driver.get(list_links[i][j])
        driver.implicitly_wait(5)
        print(i, j)

        # Click the first download link that can be found and clicked.
        # `except Exception` (instead of a bare except) keeps the best-effort
        # fallback but no longer swallows Ctrl-C, and the sleep sits outside the
        # try so an interrupt during the wait is not mistaken for a missing link.
        for xpath in download_xpaths:
            try:
                driver.find_element_by_xpath(xpath).click()
            except Exception:
                continue
            # Give the download time to run before moving on to the next app.
            time.sleep(sleep_time)
            break
        else:
            # None of the known locations worked; report it instead of
            # skipping the app silently.
            print('download link not found:', list_links[i][j])
               
# Disabled sketch for renaming a downloaded file; it was never filled in.
# downloadPath = ~~~
# fileDestination = downloadPath+newFileName+fileExtension
# os.rename(currentFile, fileDestination)

# Dump every collected download-page URL for inspection.
print(list_links)
# Pause for 10 seconds before the script ends.
time.sleep(10)
 

다운로드 중인지 확인하는 검사가 들어가 있지 않음 (예: if filename.endswith(".crdownload"): ~~)

플레이스토어가 아니라 신뢰도 떨어짐

다 부족함. 참고용.

 

댓글
댓글쓰기 폼

티스토리 방명록