# Init Chrome web driver with extensions (if applicable)
# Import section
from selenium import webdriver
import time
# Store structure (.json)
"""
{
" SearchTitleStr " : " Search in SPB, Russia " , # Title of the search [str]
" SearchURLStr " : " https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&in_polygon % 5B1 % 5D=30.2815_59.9821 % 2C30.2844_59.9821 % 2C30.2874_59.9821 % 2C30.29_59.9821 % 2C30.293_59.9822 % 2C30.2957_59.9824 % 2C30.2984_59.9824 % 2C30.3019_59.9824 % 2C30.3048_59.9824 % 2C30.3074_59.9824 % 2C30.3088_59.9835 % 2C30.3085_59.9848 % 2C30.3065_59.9859 % 2C30.3049_59.987 % 2C30.3035_59.9885 % 2C30.302_59.9897 % 2C30.2991_59.9902 % 2C30.2961_59.9904 % 2C30.2934_59.9903 % 2C30.2904_59.9898 % 2C30.2879_59.9893 % 2C30.2855_59.9888 % 2C30.2825_59.9882 % 2C30.2799_59.9879 % 2C30.2768_59.9874 % 2C30.2741_59.987 % 2C30.2716_59.9867 % 2C30.2688_59.9867 % 2C30.2657_59.9867 % 2C30.2626_59.9867 % 2C30.26_59.9867 % 2C30.2577_59.986 % 2C30.2576_59.9846 % 2C30.2588_59.9834 % 2C30.2611_59.9827 % 2C30.2641_59.9822 % 2C30.2667_59.9819 % 2C30.2697_59.9819 % 2C30.2726_59.9816 % 2C30.2753_59.9815 % 2C30.2781_59.9818 % 2C30.2807_59.9823 % 2C30.2833_59.9823 % 2C30.2815_59.9821&offer_type=flat&polygon_name % 5B1 % 5D= % D0 %9E % D0 % B1 % D0 % BB % D0 % B0 % D1 %81% D1 %82% D1 % 8C+ % D0 % BF % D0 % BE % D0 % B8 % D1 %81% D0 % BA % D0 % B0&room1=1&room2=1 " , # URL of the CIAN search [str]
" SearchDatetimeStr " : " 2020-08-01 09:33:00.838081 " , # Date of data extraction, [str]
" SearchItems " : {
" https://spb.cian.ru/sale/flat/777928777/: " : { # Item URL with https
" TitleStr " : " 3-комн. кв., 31,4 м², 5/8 этаж " , # Offer title [str]
" PriceFloat " : 10000000.0 , # Price [float]
" PriceSqmFloat " : 133333.0 , # CALCULATED Price per square meter [float]
" SqMFloat " : 31.4 , # Square meters in flat [float]
" FloorCurrentInt " : 5 , # Current floor [int]
" FloorTotalInt " : 8 , # Total number of floors in the building [int]
}
}
}
"""
##########################
# Init the Chrome web driver
###########################
# Locations of the Chrome binary and of the chromedriver executable, plus the
# list of Chrome extension files to load (empty by default).
#
# The raw-string prefix must sit directly against the opening quote
# (``r'...'``); ``r '...'`` is a syntax error.
#
# NOTE(review): both path literals are padded with spaces around every
# backslash and just inside the quotes. That looks like a text-extraction
# artifact rather than the real directory layout -- confirm against the
# deployed folder structure before relying on these values.
gChromeExeFullPath = r' .. \ Resources \ GoogleChromePortable \ App \ Chrome-bin \ chrome.exe '
gExtensionFullPathList = []
gWebDriverFullPath = r' .. \ Resources \ SeleniumWebDrivers \ Chrome \ chromedriver_win32 v84.0.4147.30 \ chromedriver.exe '
def WebDriverInit(inWebDriverFullPath, inChromeExeFullPath, inExtensionFullPathList):
    """Create a Chrome session through Selenium and return the driver object.

    Args:
        inWebDriverFullPath: Path of the chromedriver executable. When this
            value is falsy, ``webdriver.Chrome`` is called without the
            ``executable_path`` argument.
        inChromeExeFullPath: Value assigned to ``binary_location`` of the
            Chrome options.
        inExtensionFullPathList: Iterable of extension paths; every entry is
            passed to ``ChromeOptions.add_extension``.

    Returns:
        Whatever ``webdriver.Chrome`` returns for the assembled options.
    """
    # Assemble the options: which Chrome binary to run and which extensions to load.
    lChromeOptions = webdriver.ChromeOptions()
    lChromeOptions.binary_location = inChromeExeFullPath
    for lExtensionPath in inExtensionFullPathList:
        lChromeOptions.add_extension(lExtensionPath)

    # No explicit driver path given: start Chrome with the options only.
    if not inWebDriverFullPath:
        return webdriver.Chrome(options=lChromeOptions)

    # Otherwise start Chrome with the specified web driver executable.
    return webdriver.Chrome(executable_path=inWebDriverFullPath, options=lChromeOptions)
from selenium . webdriver . common . by import By
from selenium . webdriver . support . ui import WebDriverWait
from selenium . webdriver . support import expected_conditions as EC
# Initialize Google Chrome with the Selenium web driver, using the module-level
# driver path, Chrome binary path and extension list.
lWebDriver = WebDriverInit ( inWebDriverFullPath = gWebDriverFullPath , inChromeExeFullPath = gChromeExeFullPath , inExtensionFullPathList = gExtensionFullPathList )
# Search URL on spb.cian.ru: sale offers (deal_type=sale) of flats
# (offer_type=flat) inside a polygon, with room1=1 and room2=1.
# NOTE(review): the literal is padded with spaces (leading/trailing and around
# the percent-escapes); this looks like a text-extraction artifact -- confirm.
lFilterURLStr = " https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&in_polygon % 5B1 % 5D=30.2815_59.9821 % 2C30.2844_59.9821 % 2C30.2874_59.9821 % 2C30.29_59.9821 % 2C30.293_59.9822 % 2C30.2957_59.9824 % 2C30.2984_59.9824 % 2C30.3019_59.9824 % 2C30.3048_59.9824 % 2C30.3074_59.9824 % 2C30.3088_59.9835 % 2C30.3085_59.9848 % 2C30.3065_59.9859 % 2C30.3049_59.987 % 2C30.3035_59.9885 % 2C30.302_59.9897 % 2C30.2991_59.9902 % 2C30.2961_59.9904 % 2C30.2934_59.9903 % 2C30.2904_59.9898 % 2C30.2879_59.9893 % 2C30.2855_59.9888 % 2C30.2825_59.9882 % 2C30.2799_59.9879 % 2C30.2768_59.9874 % 2C30.2741_59.987 % 2C30.2716_59.9867 % 2C30.2688_59.9867 % 2C30.2657_59.9867 % 2C30.2626_59.9867 % 2C30.26_59.9867 % 2C30.2577_59.986 % 2C30.2576_59.9846 % 2C30.2588_59.9834 % 2C30.2611_59.9827 % 2C30.2641_59.9822 % 2C30.2667_59.9819 % 2C30.2697_59.9819 % 2C30.2726_59.9816 % 2C30.2753_59.9815 % 2C30.2781_59.9818 % 2C30.2807_59.9823 % 2C30.2833_59.9823&offer_type=flat&polygon_name % 5B1 % 5D= % D0 %9E % D0 % B1 % D0 % BB % D0 % B0 % D1 %81% D1 %82% D1 % 8C+ % D0 % BF % D0 % BE % D0 % B8 % D1 %81% D0 % BA % D0 % B0&room1=1&room2=1 "
# Open the search results page in the browser.
lWebDriver . get ( lFilterURLStr )
# Walk through every page of the search results and print the title, price and
# URL of each offer.
#
# NOTE(review): the selector / attribute literals below are padded with spaces
# inside the quotes (e.g. ' " Offers " ', " href "). That looks like a
# text-extraction artifact -- confirm against the live page markup.

# Offer cards: direct children of the "Offers" container, excluding the
# banner / advert containers listed in the :not() filters.
lOfferListCSSStr = ' div[data-name= " Offers " ] > div:not([data-name= " BannerServicePlaceInternal " ]):not([data-name= " getBannerMarkup " ]):not([data-name= " AdFoxBannerTracker " ]) '
# "Next page" link: the anchor inside the list item that follows the active
# pagination entry.
lNextPageItemCSS = ' div[data-name= " Pagination " ] li[class*= " active " ] + li a '

while True:
    # Offers of the page that is currently displayed.
    lOfferList = lWebDriver.find_elements_by_css_selector(css_selector=lOfferListCSSStr)
    for lOfferItem in lOfferList:
        lTitleStr = lOfferItem.find_element_by_css_selector(css_selector=' div[data-name= " TopTitle " ],div[data-name= " Title " ] ').text
        lPriceStr = lOfferItem.find_element_by_css_selector(css_selector=' div[data-name= " Price " ] > div[class*= " header " ],div[data-name= " TopPrice " ] > div[class*= " header " ] ').text
        lURLStr = lOfferItem.find_element_by_css_selector(css_selector=' a[class*= " --header-- " ] ').get_attribute(" href ")
        print(f" Title: { lTitleStr } , Price: { lPriceStr } ")
        print(f" URL: { lURLStr } ")

    # Look for the next-page link with find_elements (plural): it returns an
    # empty list when nothing matches, so a result set without a following
    # page ends the loop instead of raising NoSuchElementException.
    lNextPageList = lWebDriver.find_elements_by_css_selector(lNextPageItemCSS)
    lNextPageItem = lNextPageList[0] if lNextPageList else None
    if lNextPageItem is None:
        break

    try:
        # Click through JavaScript on the element that was just located, so
        # the selector is not duplicated inside the script text.
        lWebDriver.execute_script("arguments[0].click();", lNextPageItem)
    except Exception as e:
        # The click failed, so the browser is still on the same page. Stop
        # here rather than re-scraping that page over and over.
        print(e)
        break

    # Give the next page time to load before reading its offers.
    time.sleep(3)

print(" Over! ")