# Init Chrome web driver with extensions (if applicable)
# Import section
from selenium import webdriver
import time
import re # Regexp to extract info from string
import json
import datetime
import os
import re
import copy
import logging
# Store structure (.json)
"""
{
" SearchKeyStr " : " М С К _Т ве р с ко й" ,
" SearchTitleStr " : " Москва, район Тверской " , # Title of the search [str]
" SearchURLStr " : " https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&in_polygon % 5B1 % 5D=37.6166_55.7678 % 2C37.6147_55.7688 % 2C37.6114_55.7694 % 2C37.6085_55.7698 % 2C37.6057_55.77 % 2C37.6018_55.77 % 2C37.5987_55.77 % 2C37.5961_55.7688 % 2C37.5942_55.7677 % 2C37.5928_55.7663 % 2C37.5915_55.7647 % 2C37.5908_55.7631 % 2C37.5907_55.7616 % 2C37.5909_55.7595 % 2C37.5922_55.7577 % 2C37.5944_55.7563 % 2C37.5968_55.7555 % 2C37.6003_55.7547 % 2C37.603_55.7543 % 2C37.6055_55.7542 % 2C37.6087_55.7541 % 2C37.6113_55.7548 % 2C37.6135_55.756 % 2C37.6151_55.7574 % 2C37.6163_55.7589 % 2C37.6179_55.7606 % 2C37.6187_55.7621 % 2C37.619_55.7637 % 2C37.6194_55.7651 % 2C37.6193_55.7667 % 2C37.6178_55.7679 % 2C37.6153_55.7683 % 2C37.6166_55.7678&offer_type=flat&polygon_name % 5B1 % 5D= % D0 %9E % D0 % B1 % D0 % BB % D0 % B0 % D1 %81% D1 %82% D1 % 8C+ % D0 % BF % D0 % BE % D0 % B8 % D1 %81% D0 % BA % D0 % B0&room1=1&room2=1 " , # URL of the CIAN search [str]
" SearchDatetimeStr " : " 2020-08-01 09:33:00.838081 " , # Date of data extraction, [str]
" SearchItems " : {
" https://www.cian.ru/sale/flat/219924574/: " : { # Item URL with https
" TitleStr " : " 3-комн. кв., 31,4 м², 5/8 этаж " , # Offer title [str]
" PriceFloat " : 10000000.0 , # Price [float]
" PriceSqmFloat " : 133333.0 , # CALCULATED Price per square meters [float]
" SqMFloat " : 31.4 , # Square meters in flat [float]
" FloorCurrentInt " : 5 , # Current floor [int]
" FloorTotalInt " : 8 , # Current floor [int]
" RoomCountInt " : 3 # Room couint [int]
}
}
}
"""
##########################
# Init the Chrome web driver
###########################
gChromeExeFullPath = r ' .. \ Resources \ GoogleChromePortable \ App \ Chrome-bin \ chrome.exe '
gExtensionFullPathList = [ ]
gWebDriverFullPath = r ' .. \ Resources \ SeleniumWebDrivers \ Chrome \ chromedriver_win32 v84.0.4147.30 \ chromedriver.exe '
def WebDriverInit ( inWebDriverFullPath , inChromeExeFullPath , inExtensionFullPathList ) :
# Set full path to exe of the chrome
lWebDriverChromeOptionsInstance = webdriver . ChromeOptions ( )
lWebDriverChromeOptionsInstance . binary_location = inChromeExeFullPath
# Add extensions
for lExtensionItemFullPath in inExtensionFullPathList :
lWebDriverChromeOptionsInstance . add_extension ( lExtensionItemFullPath )
# Run chrome instance
lWebDriverInstance = None
if inWebDriverFullPath :
# Run with specified web driver path
lWebDriverInstance = webdriver . Chrome ( executable_path = inWebDriverFullPath , options = lWebDriverChromeOptionsInstance )
else :
lWebDriverInstance = webdriver . Chrome ( options = lWebDriverChromeOptionsInstance )
# Return the result
return lWebDriverInstance
from selenium . webdriver . common . by import By
from selenium . webdriver . support . ui import WebDriverWait
from selenium . webdriver . support import expected_conditions as EC
# def to extract list of offers from one job
def OffersByJobExtractDict ( inLogger , inWebDriver , inJob ) :
# BUG 0 - if timeout - retry the job +
# BUG 1 - do mouse scroll to to emulate user activity - cian can hold the robot
# BUG 2 - check the page to retry job offer if page is not next +
# BUG 3 - RE fall on Апартаменты-студия, 85,6 м², 4/8 этаж +
lRetryJobBool = True # Init flag if some error is raised - retry
while lRetryJobBool :
lRetryJobBool = False # Set false until some another action will appear
lResult = copy . deepcopy ( inJob ) # do copy the structure
lFilterURLStr = lResult [ " SearchURLStr " ]
inWebDriver . get ( lFilterURLStr ) # Open the URL
lDatetimeNowStr = str ( datetime . datetime . now ( ) )
lResult . update ( {
" SearchDatetimeStr " : lDatetimeNowStr , # Date of data extraction, [str]
" SearchItems " : { } # prepare the result
} )
# Get List of the page
lNextPageItemCSS = ' div[data-name= " Pagination " ] li[class*= " active " ] + li a '
lNextPageItem = inWebDriver . find_element_by_css_selector ( lNextPageItemCSS )
lPageCounterInt = 1 # Init the page counter
while lNextPageItem :
lPageNumberInt = int ( inWebDriver . find_element_by_css_selector ( css_selector = ' li[class*= " --active-- " ] span ' ) . text ) # Get the current page int from web and check with iterator (if not equal - retry all job)
if lPageNumberInt == lPageCounterInt :
lOfferListCSSStr = ' div[data-name= " Offers " ] > div:not([data-name= " BannerServicePlaceInternal " ]):not([data-name= " getBannerMarkup " ]):not([data-name= " AdFoxBannerTracker " ]) '
lOfferList = inWebDriver . find_elements_by_css_selector ( css_selector = lOfferListCSSStr )
for lOfferItem in lOfferList : # Processing the item, extract info
lOfferItemInfo = { # Item URL with https
" TitleStr " : " 3-комн. кв., 31,4 м², 5/8 этаж " , # Offer title [str]
" PriceFloat " : 10000000.0 , # Price [float]
" PriceSqmFloat " : 133333.0 , # CALCULATED Price per square meters [float]
" SqMFloat " : 31.4 , # Square meters in flat [float]
" FloorCurrentInt " : 5 , # Current floor [int]
" FloorTotalInt " : 8 , # Current floor [int]
" RoomCountInt " : 3 # Room couint [int]
}
lTitleStr = lOfferItem . find_element_by_css_selector ( css_selector = ' div[data-name= " TopTitle " ],div[data-name= " Title " ] ' ) . text # Extract title text
if inLogger : inLogger . info ( f " Старт обработки предложения: { lTitleStr } " )
lPriceStr = lOfferItem . find_element_by_css_selector ( css_selector = ' div[data-name= " Price " ] > div[class*= " header " ],div[data-name= " TopPrice " ] > div[class*= " header " ] ' ) . text # Extract total price
lURLStr = lOfferItem . find_element_by_css_selector ( css_selector = ' a[class*= " --header-- " ] ' ) . get_attribute ( " href " ) # Extract offer URL
lOfferItemInfo [ " TitleStr " ] = lTitleStr # set the title
lPriceStr = lPriceStr . replace ( " " , " " ) . replace ( " ₽ " , " " ) # Remove some extra symbols
lOfferItemInfo [ " PriceFloat " ] = round ( float ( lPriceStr ) , 2 ) # Convert price to the float type
#Check if Апартаменты
if " АПАРТАМЕНТЫ " in lTitleStr . upper ( ) :
lREResult = re . search ( r " .*, ( \ d*,? \ d*) м², ( \ d*)/( \ d*) эта. " , lTitleStr ) # run the re
lOfferItemInfo [ " RoomCountInt " ] = 1 # Room count
lSqmStr = lREResult . group ( 1 )
lSqmStr = lSqmStr . replace ( " , " , " . " )
lOfferItemInfo [ " SqMFloat " ] = round ( float ( lSqmStr ) , 2 ) # sqm count
lOfferItemInfo [ " FloorCurrentInt " ] = int ( lREResult . group ( 2 ) ) # Floor current
lOfferItemInfo [ " FloorTotalInt " ] = int ( lREResult . group ( 3 ) ) # Floor total
lOfferItemInfo [ " PriceSqmFloat " ] = round ( lOfferItemInfo [ " PriceFloat " ] / lOfferItemInfo [ " SqMFloat " ] , 2 ) # Sqm per M
else :
lREResult = re . search ( r " .*( \ d)-комн. .*, ( \ d*,? \ d*) м², ( \ d*)/( \ d*) эта. " , lTitleStr ) # run the re
lOfferItemInfo [ " RoomCountInt " ] = int ( lREResult . group ( 1 ) ) # Room count
lSqmStr = lREResult . group ( 2 )
lSqmStr = lSqmStr . replace ( " , " , " . " )
lOfferItemInfo [ " SqMFloat " ] = round ( float ( lSqmStr ) , 2 ) # sqm count
lOfferItemInfo [ " FloorCurrentInt " ] = int ( lREResult . group ( 3 ) ) # Floor current
lOfferItemInfo [ " FloorTotalInt " ] = int ( lREResult . group ( 4 ) ) # Floor total
lOfferItemInfo [ " PriceSqmFloat " ] = round ( lOfferItemInfo [ " PriceFloat " ] / lOfferItemInfo [ " SqMFloat " ] , 2 ) # Sqm per M
lResult [ ' SearchItems ' ] [ lURLStr ] = lOfferItemInfo # Set item in result dict
# Click next page item
lNextPageItem = None
lNextPageList = inWebDriver . find_elements_by_css_selector ( lNextPageItemCSS )
if len ( lNextPageList ) > 0 :
lNextPageItem = lNextPageList [ 0 ]
try :
#lNextPageItem = WebDriverWait(lWebDriver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[data-name="Pagination"]')))
#lNextPageItem.click()
inWebDriver . execute_script ( """ document.querySelector( ' div[data-name= " Pagination " ] li[class*= " active " ] + li a ' ).click() """ )
except Exception as e :
print ( e )
time . sleep ( 0.5 ) # some init operations
# wait while preloader is active. If timeout - retry all job
lTimeFromFLoat = time . time ( ) # get current time in float (seconds)
lDoWaitBool = True
while lDoWaitBool :
lPreloaderCSS = inWebDriver . find_elements_by_css_selector ( css_selector = ' div[class*= " --preloadOverlay-- " ] ' )
if len ( lPreloaderCSS ) > 0 : time . sleep ( 0.5 ) # preloader is here - wait
else : lDoWaitBool = False # Stop wait if preloader is dissappear
if ( time . time ( ) - lTimeFromFLoat ) > 15 : # check if timeout is more than 15 seconds
lRetryJobBool = True # Loading error on page - do break, then retry the job
if inLogger : inLogger . warning ( f " Ожидание загрузки страницы более { 15 } с ., Робот повторит задание сначала " )
break # break the loop
if lRetryJobBool == True : # break the loop if RetryJobBool is true
break
lPageCounterInt = lPageCounterInt + 1 # Increment the page counter
else :
lRetryJobBool = True
if inLogger : inLogger . warning (
f " Следующая страница по списку не была загружена. Была загружена страница: { lPageNumberInt } , Ожидалась страница: { lPageCounterInt } " )
if lRetryJobBool == False : # break the loop if RetryJobBool is true
# Check dir - create if not exists
if not os . path . exists ( os . path . join ( ' Datasets ' , lResult [ ' SearchKeyStr ' ] ) ) :
os . makedirs ( os . path . join ( ' Datasets ' , lResult [ ' SearchKeyStr ' ] ) )
# Save result in file
lFile = open ( f " { os . path . join ( ' Datasets ' , lResult [ ' SearchKeyStr ' ] , lDatetimeNowStr . replace ( ' ' , ' _ ' ) . replace ( ' - ' , ' _ ' ) . replace ( ' : ' , ' _ ' ) . replace ( ' . ' , ' _ ' ) ) } .json " , " w " , encoding = " utf-8 " )
lFile . write ( json . dumps ( lResult ) )
lFile . close ( )
# Инициализировать Google Chrome with selenium web driver
lWebDriver = WebDriverInit ( inWebDriverFullPath = gWebDriverFullPath , inChromeExeFullPath = gChromeExeFullPath , inExtensionFullPathList = gExtensionFullPathList )
lFilterURLStr = " https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&in_polygon % 5B1 % 5D=37.6166_55.7678 % 2C37.6147_55.7688 % 2C37.6114_55.7694 % 2C37.6085_55.7698 % 2C37.6057_55.77 % 2C37.6018_55.77 % 2C37.5987_55.77 % 2C37.5961_55.7688 % 2C37.5942_55.7677 % 2C37.5928_55.7663 % 2C37.5915_55.7647 % 2C37.5908_55.7631 % 2C37.5907_55.7616 % 2C37.5909_55.7595 % 2C37.5922_55.7577 % 2C37.5944_55.7563 % 2C37.5968_55.7555 % 2C37.6003_55.7547 % 2C37.603_55.7543 % 2C37.6055_55.7542 % 2C37.6087_55.7541 % 2C37.6113_55.7548 % 2C37.6135_55.756 % 2C37.6151_55.7574 % 2C37.6163_55.7589 % 2C37.6179_55.7606 % 2C37.6187_55.7621 % 2C37.619_55.7637 % 2C37.6194_55.7651 % 2C37.6193_55.7667 % 2C37.6178_55.7679 % 2C37.6153_55.7683 % 2C37.6166_55.7678&offer_type=flat&polygon_name % 5B1 % 5D= % D0 %9E % D0 % B1 % D0 % BB % D0 % B0 % D1 %81% D1 %82% D1 % 8C+ % D0 % BF % D0 % BE % D0 % B8 % D1 %81% D0 % BA % D0 % B0&room1=1&room2=1 "
lJobItem = {
" SearchKeyStr " : " М С К _Т ве р с ко й" ,
" SearchTitleStr " : " Москва, район Тверской " , # Title of the search [str]
" SearchURLStr " : lFilterURLStr ,
# URL of the CIAN search [str]
}
OffersByJobExtractDict ( inLogger = logging , inWebDriver = lWebDriver , inJob = lJobItem )