-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathplatts.py
More file actions
110 lines (76 loc) · 2.68 KB
/
platts.py
File metadata and controls
110 lines (76 loc) · 2.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# -*- coding: utf-8 -*-
from selenium import webdriver
import pandas as pd
def initialise_platts_driver(url, path):
    """
    Start a Chrome driver with website notifications disabled and open ``url``.

    If loading the page fails, the browser is shut down before the error is
    re-raised so that no Chrome / chromedriver process is left running.

    :param url: platts url
    :type url: str
    :param path: driver path
    :type path: str
    :return: webdriver with ``url`` loaded
    """
    chrome_options = webdriver.ChromeOptions()
    # This preference prevents the website from sending notifications
    prefs = {"profile.default_content_setting_values.notifications": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    # ``options=`` is the supported spelling of the deprecated ``chrome_options=``
    driver = webdriver.Chrome(path, options=chrome_options)
    try:
        # Target URL
        driver.get(url)
    except BaseException:
        # Navigation failed (or was interrupted): release the browser, then
        # let the caller see the original error.
        driver.quit()
        raise
    return driver
def click_options(driver, commodity_ids=("commodity2", "commodity7", "commodity8")):
    """
    Choose topics for platts news by clicking their filter labels.

    :param driver: webdriver
    :type driver: webdriver
    :param commodity_ids: values of the ``for`` attribute of the ``<label>``
        elements to click, in click order; defaults to the three topics that
        were previously hard-coded
    :type commodity_ids: iterable of str
    :return: webdriver
    """
    for commodity_id in commodity_ids:
        label_xpath = '//label[@for="{}"]'.format(commodity_id)
        # find_element(by, value) exists in both old and current selenium
        # releases (find_element_by_xpath was removed in 4.3); "xpath" is the
        # value of By.XPATH.
        driver.find_element("xpath", label_xpath).click()
    return driver
def _first_text(element, xpath):
    """Return the text of the first node under ``element`` matching ``xpath``, or None."""
    matches = element.find_elements("xpath", xpath)
    return matches[0].text if matches else None


def extract_news(driver):
    """
    Extract one row per news link currently on the page:
    - title
    - topic
    - date
    - url

    Every field is looked up relative to its own ``<a>`` element, so the
    columns of a row always belong to the same news item. A field that is
    missing for an item is stored as None instead of shifting the other
    columns or making the DataFrame construction fail on unequal lengths.
    As before, topic and date are only read from anchors whose
    ``data-gtm-category`` is ``'News Feed'``.

    :param driver: webdriver
    :type driver: webdriver
    :return: driver, pandas DataFrame with columns title, topic, date, url
    """
    rows = []
    # find_elements(by, value) works on old and current selenium releases
    # (find_elements_by_xpath was removed in 4.3); "xpath" equals By.XPATH.
    for anchor in driver.find_elements("xpath", "//div[@class='newsId']/a"):
        is_news_feed = anchor.get_attribute('data-gtm-category') == 'News Feed'
        topic = None
        date = None
        if is_news_feed:
            topic = _first_text(anchor, "./div/ul/li[@class='meta-data__type']")
            date = _first_text(anchor, "./div/ul/li[@class='meta-data__date']")
        rows.append({
            'title': _first_text(anchor, "./div/h2"),
            'topic': topic,
            'date': date,
            'url': anchor.get_attribute('href'),
        })
    # Explicit columns keep the schema stable even when no news item is found
    res_df = pd.DataFrame(rows, columns=['title', 'topic', 'date', 'url'])
    return driver, res_df
def load_more_page(driver):
    """
    Load more news by clicking the element whose id is ``loadMoreNews``.

    :param driver: webdriver
    :type driver: webdriver
    :return: webdriver
    """
    # find_element(by, value) works on old and current selenium releases
    # (find_element_by_id was removed in 4.3); "id" is the value of By.ID.
    driver.find_element("id", "loadMoreNews").click()
    return driver
def scroll_down(driver):
    """
    Scroll the current page down to its bottom.

    The scroll height of the document body is read through JavaScript and
    the window is then scrolled to that vertical offset.

    :param driver: webdriver
    :type driver: webdriver
    :return: webdriver
    """
    # Ask the browser how tall the page currently is
    page_height = driver.execute_script("return document.body.scrollHeight")
    # Build the scroll command for that height and run it in the page
    scroll_command = "window.scrollTo(0, {});".format(page_height)
    driver.execute_script(scroll_command)
    return driver