From 2e351f9d86aea38a6c1e47eca7203693632447b4 Mon Sep 17 00:00:00 2001 From: Ivan Maslov Date: Tue, 4 Aug 2020 22:27:48 +0300 Subject: [PATCH] # tutorial 2 example is working --- .../WebGUI_Habr/3. MonitoringCIAN_Run_64.py | 45 ++++++++++++++++--- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/Wiki/RUS_Tutorial/WebGUI_Habr/3. MonitoringCIAN_Run_64.py b/Wiki/RUS_Tutorial/WebGUI_Habr/3. MonitoringCIAN_Run_64.py index d3eab055..a5df21e0 100644 --- a/Wiki/RUS_Tutorial/WebGUI_Habr/3. MonitoringCIAN_Run_64.py +++ b/Wiki/RUS_Tutorial/WebGUI_Habr/3. MonitoringCIAN_Run_64.py @@ -2,9 +2,13 @@ # Import section from selenium import webdriver import time +import re # Regexp to extract info from string +import json +import datetime # Store structure (.json) """ { + "SearchKeyStr": "SPB", "SearchTitleStr": "Search in SPB, Russia", # Title of the search [str] "SearchURLStr": "https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&in_polygon%5B1%5D=30.2815_59.9821%2C30.2844_59.9821%2C30.2874_59.9821%2C30.29_59.9821%2C30.293_59.9822%2C30.2957_59.9824%2C30.2984_59.9824%2C30.3019_59.9824%2C30.3048_59.9824%2C30.3074_59.9824%2C30.3088_59.9835%2C30.3085_59.9848%2C30.3065_59.9859%2C30.3049_59.987%2C30.3035_59.9885%2C30.302_59.9897%2C30.2991_59.9902%2C30.2961_59.9904%2C30.2934_59.9903%2C30.2904_59.9898%2C30.2879_59.9893%2C30.2855_59.9888%2C30.2825_59.9882%2C30.2799_59.9879%2C30.2768_59.9874%2C30.2741_59.987%2C30.2716_59.9867%2C30.2688_59.9867%2C30.2657_59.9867%2C30.2626_59.9867%2C30.26_59.9867%2C30.2577_59.986%2C30.2576_59.9846%2C30.2588_59.9834%2C30.2611_59.9827%2C30.2641_59.9822%2C30.2667_59.9819%2C30.2697_59.9819%2C30.2726_59.9816%2C30.2753_59.9815%2C30.2781_59.9818%2C30.2807_59.9823%2C30.2833_59.9823%2C30.2815_59.9821&offer_type=flat&polygon_name%5B1%5D=%D0%9E%D0%B1%D0%BB%D0%B0%D1%81%D1%82%D1%8C+%D0%BF%D0%BE%D0%B8%D1%81%D0%BA%D0%B0&room1=1&room2=1", # URL of the CIAN search [str] "SearchDatetimeStr": "2020-08-01 09:33:00.838081", # Date of data extraction, [str] @@ 
-16,6 +20,7 @@ import time "SqMFloat": 31.4, # Square meters in flat [float] "FloorCurrentInt": 5, # Current floor [int] "FloorTotalInt": 8, # Current floor [int] + "RoomCountInt": 3 # Room couint [int] } } @@ -52,19 +57,46 @@ from selenium.webdriver.support import expected_conditions as EC # Инициализировать Google Chrome with selenium web driver lWebDriver = WebDriverInit(inWebDriverFullPath = gWebDriverFullPath, inChromeExeFullPath = gChromeExeFullPath, inExtensionFullPathList = gExtensionFullPathList) lFilterURLStr = "https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&in_polygon%5B1%5D=30.2815_59.9821%2C30.2844_59.9821%2C30.2874_59.9821%2C30.29_59.9821%2C30.293_59.9822%2C30.2957_59.9824%2C30.2984_59.9824%2C30.3019_59.9824%2C30.3048_59.9824%2C30.3074_59.9824%2C30.3088_59.9835%2C30.3085_59.9848%2C30.3065_59.9859%2C30.3049_59.987%2C30.3035_59.9885%2C30.302_59.9897%2C30.2991_59.9902%2C30.2961_59.9904%2C30.2934_59.9903%2C30.2904_59.9898%2C30.2879_59.9893%2C30.2855_59.9888%2C30.2825_59.9882%2C30.2799_59.9879%2C30.2768_59.9874%2C30.2741_59.987%2C30.2716_59.9867%2C30.2688_59.9867%2C30.2657_59.9867%2C30.2626_59.9867%2C30.26_59.9867%2C30.2577_59.986%2C30.2576_59.9846%2C30.2588_59.9834%2C30.2611_59.9827%2C30.2641_59.9822%2C30.2667_59.9819%2C30.2697_59.9819%2C30.2726_59.9816%2C30.2753_59.9815%2C30.2781_59.9818%2C30.2807_59.9823%2C30.2833_59.9823&offer_type=flat&polygon_name%5B1%5D=%D0%9E%D0%B1%D0%BB%D0%B0%D1%81%D1%82%D1%8C+%D0%BF%D0%BE%D0%B8%D1%81%D0%BA%D0%B0&room1=1&room2=1" -lWebDriver.get(lFilterURLStr) +lWebDriver.get(lFilterURLStr) # Open the URL +lDatetimeNowStr = str(datetime.datetime.now()) +lResult = { + "SearchKeyStr": "SPB", + "SearchTitleStr": "Search in SPB, Russia", # Title of the search [str] + "SearchURLStr": 
"https://spb.cian.ru/cat.php?deal_type=sale&engine_version=2&in_polygon%5B1%5D=30.2815_59.9821%2C30.2844_59.9821%2C30.2874_59.9821%2C30.29_59.9821%2C30.293_59.9822%2C30.2957_59.9824%2C30.2984_59.9824%2C30.3019_59.9824%2C30.3048_59.9824%2C30.3074_59.9824%2C30.3088_59.9835%2C30.3085_59.9848%2C30.3065_59.9859%2C30.3049_59.987%2C30.3035_59.9885%2C30.302_59.9897%2C30.2991_59.9902%2C30.2961_59.9904%2C30.2934_59.9903%2C30.2904_59.9898%2C30.2879_59.9893%2C30.2855_59.9888%2C30.2825_59.9882%2C30.2799_59.9879%2C30.2768_59.9874%2C30.2741_59.987%2C30.2716_59.9867%2C30.2688_59.9867%2C30.2657_59.9867%2C30.2626_59.9867%2C30.26_59.9867%2C30.2577_59.986%2C30.2576_59.9846%2C30.2588_59.9834%2C30.2611_59.9827%2C30.2641_59.9822%2C30.2667_59.9819%2C30.2697_59.9819%2C30.2726_59.9816%2C30.2753_59.9815%2C30.2781_59.9818%2C30.2807_59.9823%2C30.2833_59.9823%2C30.2815_59.9821&offer_type=flat&polygon_name%5B1%5D=%D0%9E%D0%B1%D0%BB%D0%B0%D1%81%D1%82%D1%8C+%D0%BF%D0%BE%D0%B8%D1%81%D0%BA%D0%B0&room1=1&room2=1", # URL of the CIAN search [str] + "SearchDatetimeStr": lDatetimeNowStr, # Date of data extraction, [str] + "SearchItems": {} # prepare the result +} # Get List of the page lOfferListCSSStr = 'div[data-name="Offers"] > div:not([data-name="BannerServicePlaceInternal"]):not([data-name="getBannerMarkup"]):not([data-name="AdFoxBannerTracker"])' lOfferList = lWebDriver.find_elements_by_css_selector(css_selector=lOfferListCSSStr) lNextPageItemCSS = 'div[data-name="Pagination"] li[class*="active"] + li a' lNextPageItem = lWebDriver.find_element_by_css_selector(lNextPageItemCSS) while lNextPageItem: - for lOfferItem in lOfferList: + for lOfferItem in lOfferList: # Processing the item, extract info + lOfferItemInfo = { # Item URL with https + "TitleStr": "3-комн. 
кв., 31,4 м², 5/8 этаж", # Offer title [str] + "PriceFloat": 10000000.0, # Price [float] + "PriceSqmFloat": 133333.0, # CALCULATED Price per square meters [float] + "SqMFloat": 31.4, # Square meters in flat [float] + "FloorCurrentInt": 5, # Current floor [int] + "FloorTotalInt": 8, # Total floor count [int] + "RoomCountInt": 3 # Room count [int] + } lTitleStr = lOfferItem.find_element_by_css_selector(css_selector='div[data-name="TopTitle"],div[data-name="Title"]').text lPriceStr = lOfferItem.find_element_by_css_selector(css_selector='div[data-name="Price"] > div[class*="header"],div[data-name="TopPrice"] > div[class*="header"]').text lURLStr = lOfferItem.find_element_by_css_selector(css_selector='a[class*="--header--"]').get_attribute("href") - print(f"Title: {lTitleStr}, Price: {lPriceStr}") - print(f"URL: {lURLStr}") + lOfferItemInfo["TitleStr"] = lTitleStr # set the title + lPriceStr = lPriceStr.replace(" ","").replace("₽","") + lOfferItemInfo["PriceFloat"] = round(float(lPriceStr),2) # Set the price + lREResult = re.match(r"(\d)-комн. 
.*, (\d*,?\d*) м², (\d*)/(\d*) эта.", lTitleStr) # Parse room count, sqm and floors from the title + lOfferItemInfo["RoomCountInt"] = int(lREResult.group(1)) # Room count as int (group() returns str) + lSqmStr = lREResult.group(2) + lSqmStr= lSqmStr.replace(",",".") + lOfferItemInfo["SqMFloat"] = round(float(lSqmStr),2) # sqm count + lOfferItemInfo["FloorCurrentInt"] = int(lREResult.group(3)) # Floor current + lOfferItemInfo["FloorTotalInt"] = int(lREResult.group(4)) # Floor total + lOfferItemInfo["PriceSqmFloat"] = round(lOfferItemInfo["PriceFloat"] / lOfferItemInfo["SqMFloat"],2) # Price per square meter + lResult['SearchItems'][lURLStr] = lOfferItemInfo # Set item in result dict # Click next page item lNextPageItem = None lNextPageList = lWebDriver.find_elements_by_css_selector(lNextPageItemCSS) @@ -78,4 +110,7 @@ while lNextPageItem: print(e) time.sleep(3) lOfferList = lWebDriver.find_elements_by_css_selector(css_selector=lOfferListCSSStr) -print("Over!") \ No newline at end of file +# Save result in file +lFile = open(f"{lResult['SearchKeyStr']}_{lDatetimeNowStr.replace(' ','_').replace('-','_').replace(':','_')}","w",encoding="utf-8") +lFile.write(json.dumps(lResult)) +lFile.close() \ No newline at end of file