# tutorial 2 example is working

dev-linux
Ivan Maslov 4 years ago
parent 96401e74cb
commit 2e351f9d86

@ -2,9 +2,13 @@
# Import section # Import section
from selenium import webdriver from selenium import webdriver
import time import time
import re # Regexp to extract info from string
import json
import datetime
# Store structure (.json) # Store structure (.json)
""" """
{ {
"SearchKeyStr": "SPB",
"SearchTitleStr": "Search in SPB, Russia", # Title of the search [str] "SearchTitleStr": "Search in SPB, Russia", # Title of the search [str]
"SearchURLStr": "https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&in_polygon%5B1%5D=30.2815_59.9821%2C30.2844_59.9821%2C30.2874_59.9821%2C30.29_59.9821%2C30.293_59.9822%2C30.2957_59.9824%2C30.2984_59.9824%2C30.3019_59.9824%2C30.3048_59.9824%2C30.3074_59.9824%2C30.3088_59.9835%2C30.3085_59.9848%2C30.3065_59.9859%2C30.3049_59.987%2C30.3035_59.9885%2C30.302_59.9897%2C30.2991_59.9902%2C30.2961_59.9904%2C30.2934_59.9903%2C30.2904_59.9898%2C30.2879_59.9893%2C30.2855_59.9888%2C30.2825_59.9882%2C30.2799_59.9879%2C30.2768_59.9874%2C30.2741_59.987%2C30.2716_59.9867%2C30.2688_59.9867%2C30.2657_59.9867%2C30.2626_59.9867%2C30.26_59.9867%2C30.2577_59.986%2C30.2576_59.9846%2C30.2588_59.9834%2C30.2611_59.9827%2C30.2641_59.9822%2C30.2667_59.9819%2C30.2697_59.9819%2C30.2726_59.9816%2C30.2753_59.9815%2C30.2781_59.9818%2C30.2807_59.9823%2C30.2833_59.9823%2C30.2815_59.9821&offer_type=flat&polygon_name%5B1%5D=%D0%9E%D0%B1%D0%BB%D0%B0%D1%81%D1%82%D1%8C+%D0%BF%D0%BE%D0%B8%D1%81%D0%BA%D0%B0&room1=1&room2=1", # URL of the CIAN search [str] "SearchURLStr": 
"https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&in_polygon%5B1%5D=30.2815_59.9821%2C30.2844_59.9821%2C30.2874_59.9821%2C30.29_59.9821%2C30.293_59.9822%2C30.2957_59.9824%2C30.2984_59.9824%2C30.3019_59.9824%2C30.3048_59.9824%2C30.3074_59.9824%2C30.3088_59.9835%2C30.3085_59.9848%2C30.3065_59.9859%2C30.3049_59.987%2C30.3035_59.9885%2C30.302_59.9897%2C30.2991_59.9902%2C30.2961_59.9904%2C30.2934_59.9903%2C30.2904_59.9898%2C30.2879_59.9893%2C30.2855_59.9888%2C30.2825_59.9882%2C30.2799_59.9879%2C30.2768_59.9874%2C30.2741_59.987%2C30.2716_59.9867%2C30.2688_59.9867%2C30.2657_59.9867%2C30.2626_59.9867%2C30.26_59.9867%2C30.2577_59.986%2C30.2576_59.9846%2C30.2588_59.9834%2C30.2611_59.9827%2C30.2641_59.9822%2C30.2667_59.9819%2C30.2697_59.9819%2C30.2726_59.9816%2C30.2753_59.9815%2C30.2781_59.9818%2C30.2807_59.9823%2C30.2833_59.9823%2C30.2815_59.9821&offer_type=flat&polygon_name%5B1%5D=%D0%9E%D0%B1%D0%BB%D0%B0%D1%81%D1%82%D1%8C+%D0%BF%D0%BE%D0%B8%D1%81%D0%BA%D0%B0&room1=1&room2=1", # URL of the CIAN search [str]
"SearchDatetimeStr": "2020-08-01 09:33:00.838081", # Date of data extraction, [str] "SearchDatetimeStr": "2020-08-01 09:33:00.838081", # Date of data extraction, [str]
@ -16,6 +20,7 @@ import time
"SqMFloat": 31.4, # Square meters in flat [float] "SqMFloat": 31.4, # Square meters in flat [float]
"FloorCurrentInt": 5, # Current floor [int] "FloorCurrentInt": 5, # Current floor [int]
"FloorTotalInt": 8, # Total floors in the building [int] "FloorTotalInt": 8, # Total floors in the building [int]
"RoomCountInt": 3 # Room count [int]
} }
} }
@ -52,19 +57,46 @@ from selenium.webdriver.support import expected_conditions as EC
# Initialize Google Chrome with selenium web driver # Initialize Google Chrome with selenium web driver
lWebDriver = WebDriverInit(inWebDriverFullPath = gWebDriverFullPath, inChromeExeFullPath = gChromeExeFullPath, inExtensionFullPathList = gExtensionFullPathList) lWebDriver = WebDriverInit(inWebDriverFullPath = gWebDriverFullPath, inChromeExeFullPath = gChromeExeFullPath, inExtensionFullPathList = gExtensionFullPathList)
lFilterURLStr = "https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&in_polygon%5B1%5D=30.2815_59.9821%2C30.2844_59.9821%2C30.2874_59.9821%2C30.29_59.9821%2C30.293_59.9822%2C30.2957_59.9824%2C30.2984_59.9824%2C30.3019_59.9824%2C30.3048_59.9824%2C30.3074_59.9824%2C30.3088_59.9835%2C30.3085_59.9848%2C30.3065_59.9859%2C30.3049_59.987%2C30.3035_59.9885%2C30.302_59.9897%2C30.2991_59.9902%2C30.2961_59.9904%2C30.2934_59.9903%2C30.2904_59.9898%2C30.2879_59.9893%2C30.2855_59.9888%2C30.2825_59.9882%2C30.2799_59.9879%2C30.2768_59.9874%2C30.2741_59.987%2C30.2716_59.9867%2C30.2688_59.9867%2C30.2657_59.9867%2C30.2626_59.9867%2C30.26_59.9867%2C30.2577_59.986%2C30.2576_59.9846%2C30.2588_59.9834%2C30.2611_59.9827%2C30.2641_59.9822%2C30.2667_59.9819%2C30.2697_59.9819%2C30.2726_59.9816%2C30.2753_59.9815%2C30.2781_59.9818%2C30.2807_59.9823%2C30.2833_59.9823&offer_type=flat&polygon_name%5B1%5D=%D0%9E%D0%B1%D0%BB%D0%B0%D1%81%D1%82%D1%8C+%D0%BF%D0%BE%D0%B8%D1%81%D0%BA%D0%B0&room1=1&room2=1" lFilterURLStr = "https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&in_polygon%5B1%5D=30.2815_59.9821%2C30.2844_59.9821%2C30.2874_59.9821%2C30.29_59.9821%2C30.293_59.9822%2C30.2957_59.9824%2C30.2984_59.9824%2C30.3019_59.9824%2C30.3048_59.9824%2C30.3074_59.9824%2C30.3088_59.9835%2C30.3085_59.9848%2C30.3065_59.9859%2C30.3049_59.987%2C30.3035_59.9885%2C30.302_59.9897%2C30.2991_59.9902%2C30.2961_59.9904%2C30.2934_59.9903%2C30.2904_59.9898%2C30.2879_59.9893%2C30.2855_59.9888%2C30.2825_59.9882%2C30.2799_59.9879%2C30.2768_59.9874%2C30.2741_59.987%2C30.2716_59.9867%2C30.2688_59.9867%2C30.2657_59.9867%2C30.2626_59.9867%2C30.26_59.9867%2C30.2577_59.986%2C30.2576_59.9846%2C30.2588_59.9834%2C30.2611_59.9827%2C30.2641_59.9822%2C30.2667_59.9819%2C30.2697_59.9819%2C30.2726_59.9816%2C30.2753_59.9815%2C30.2781_59.9818%2C30.2807_59.9823%2C30.2833_59.9823&offer_type=flat&polygon_name%5B1%5D=%D0%9E%D0%B1%D0%BB%D0%B0%D1%81%D1%82%D1%8C+%D0%BF%D0%BE%D0%B8%D1%81%D0%BA%D0%B0&room1=1&room2=1"
lWebDriver.get(lFilterURLStr) lWebDriver.get(lFilterURLStr) # Open the URL
lDatetimeNowStr = str(datetime.datetime.now())
lResult = {
"SearchKeyStr": "SPB",
"SearchTitleStr": "Search in SPB, Russia", # Title of the search [str]
"SearchURLStr": "https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&in_polygon%5B1%5D=30.2815_59.9821%2C30.2844_59.9821%2C30.2874_59.9821%2C30.29_59.9821%2C30.293_59.9822%2C30.2957_59.9824%2C30.2984_59.9824%2C30.3019_59.9824%2C30.3048_59.9824%2C30.3074_59.9824%2C30.3088_59.9835%2C30.3085_59.9848%2C30.3065_59.9859%2C30.3049_59.987%2C30.3035_59.9885%2C30.302_59.9897%2C30.2991_59.9902%2C30.2961_59.9904%2C30.2934_59.9903%2C30.2904_59.9898%2C30.2879_59.9893%2C30.2855_59.9888%2C30.2825_59.9882%2C30.2799_59.9879%2C30.2768_59.9874%2C30.2741_59.987%2C30.2716_59.9867%2C30.2688_59.9867%2C30.2657_59.9867%2C30.2626_59.9867%2C30.26_59.9867%2C30.2577_59.986%2C30.2576_59.9846%2C30.2588_59.9834%2C30.2611_59.9827%2C30.2641_59.9822%2C30.2667_59.9819%2C30.2697_59.9819%2C30.2726_59.9816%2C30.2753_59.9815%2C30.2781_59.9818%2C30.2807_59.9823%2C30.2833_59.9823%2C30.2815_59.9821&offer_type=flat&polygon_name%5B1%5D=%D0%9E%D0%B1%D0%BB%D0%B0%D1%81%D1%82%D1%8C+%D0%BF%D0%BE%D0%B8%D1%81%D0%BA%D0%B0&room1=1&room2=1", # URL of the CIAN search [str]
"SearchDatetimeStr": lDatetimeNowStr, # Date of data extraction, [str]
"SearchItems": {} # prepare the result
}
# Get List of the page # Get List of the page
lOfferListCSSStr = 'div[data-name="Offers"] > div:not([data-name="BannerServicePlaceInternal"]):not([data-name="getBannerMarkup"]):not([data-name="AdFoxBannerTracker"])' lOfferListCSSStr = 'div[data-name="Offers"] > div:not([data-name="BannerServicePlaceInternal"]):not([data-name="getBannerMarkup"]):not([data-name="AdFoxBannerTracker"])'
lOfferList = lWebDriver.find_elements_by_css_selector(css_selector=lOfferListCSSStr) lOfferList = lWebDriver.find_elements_by_css_selector(css_selector=lOfferListCSSStr)
lNextPageItemCSS = 'div[data-name="Pagination"] li[class*="active"] + li a' lNextPageItemCSS = 'div[data-name="Pagination"] li[class*="active"] + li a'
lNextPageItem = lWebDriver.find_element_by_css_selector(lNextPageItemCSS) lNextPageItem = lWebDriver.find_element_by_css_selector(lNextPageItemCSS)
while lNextPageItem: while lNextPageItem:
for lOfferItem in lOfferList: for lOfferItem in lOfferList: # Processing the item, extract info
lOfferItemInfo = { # Offer info template; stored in lResult['SearchItems'] under the offer URL
"TitleStr": "3-комн. кв., 31,4 м², 5/8 этаж", # Offer title [str]
"PriceFloat": 10000000.0, # Price [float]
"PriceSqmFloat": 133333.0, # CALCULATED Price per square meter [float]
"SqMFloat": 31.4, # Square meters in flat [float]
"FloorCurrentInt": 5, # Current floor [int]
"FloorTotalInt": 8, # Total floors in the building [int]
"RoomCountInt": 3 # Room count [int]
}
lTitleStr = lOfferItem.find_element_by_css_selector(css_selector='div[data-name="TopTitle"],div[data-name="Title"]').text lTitleStr = lOfferItem.find_element_by_css_selector(css_selector='div[data-name="TopTitle"],div[data-name="Title"]').text
lPriceStr = lOfferItem.find_element_by_css_selector(css_selector='div[data-name="Price"] > div[class*="header"],div[data-name="TopPrice"] > div[class*="header"]').text lPriceStr = lOfferItem.find_element_by_css_selector(css_selector='div[data-name="Price"] > div[class*="header"],div[data-name="TopPrice"] > div[class*="header"]').text
lURLStr = lOfferItem.find_element_by_css_selector(css_selector='a[class*="--header--"]').get_attribute("href") lURLStr = lOfferItem.find_element_by_css_selector(css_selector='a[class*="--header--"]').get_attribute("href")
print(f"Title: {lTitleStr}, Price: {lPriceStr}") lOfferItemInfo["TitleStr"] = lTitleStr # set the title
print(f"URL: {lURLStr}") lPriceStr = lPriceStr.replace(" ","").replace("","")
lOfferItemInfo["PriceFloat"] = round(float(lPriceStr),2) # Set the price
lREResult = re.match(r"(\d)-комн. .*, (\d*,?\d*) м², (\d*)/(\d*) эта.", lTitleStr) # run the re
lOfferItemInfo["RoomCountInt"] = lREResult.group(1) # Room count
lSqmStr = lREResult.group(2)
lSqmStr= lSqmStr.replace(",",".")
lOfferItemInfo["SqMFloat"] = round(float(lSqmStr),2) # sqm count
lOfferItemInfo["FloorCurrentInt"] = int(lREResult.group(3)) # Floor current
lOfferItemInfo["FloorTotalInt"] = int(lREResult.group(4)) # Floor total
lOfferItemInfo["PriceSqmFloat"] = round(lOfferItemInfo["PriceFloat"] / lOfferItemInfo["SqMFloat"],2) # Price per square meter
lResult['SearchItems'][lURLStr] = lOfferItemInfo # Set item in result dict
# Click next page item # Click next page item
lNextPageItem = None lNextPageItem = None
lNextPageList = lWebDriver.find_elements_by_css_selector(lNextPageItemCSS) lNextPageList = lWebDriver.find_elements_by_css_selector(lNextPageItemCSS)
@ -78,4 +110,7 @@ while lNextPageItem:
print(e) print(e)
time.sleep(3) time.sleep(3)
lOfferList = lWebDriver.find_elements_by_css_selector(css_selector=lOfferListCSSStr) lOfferList = lWebDriver.find_elements_by_css_selector(css_selector=lOfferListCSSStr)
print("Over!") # Save result in file
lFile = open(f"{lResult['SearchKeyStr']}_{lDatetimeNowStr.replace(' ','_').replace('-','_').replace(':','_')}","w",encoding="utf-8")
lFile.write(json.dumps(lResult))
lFile.close()
Loading…
Cancel
Save