# category product scraper.py
from typing import Optional, Generator, List, Dict, Any
import pandas as pd
from requests_html import HTMLSession


class DigikalaScraper:
    """Scraper for the Digikala.com website that collects product metadata.

    Attributes:
        session: A `requests_html.HTMLSession` object.
        filters: A dictionary mapping filter names to search query parameters.
    """

def __init__(self) -> None:
self.session = HTMLSession()
self.filters = {
'only_digiplus': 'only_plus=1',
'only_supermarkets': 'only_fresh=1',
'ship_by_seller': 'has_ship_by_seller=1',
'jet_delivery': 'has_jet_delivery=1',
'available_items': 'has_selling_stock=1',
'available_in_stores': 'has_ready_to_shipment=1',
'seller_digikala': 'seller_types[0]=digikala',
'seller_official': 'seller_types[1]=official',
'seller_trusted': 'seller_types[2]=trusted',
'seller_indigenous': 'seller_types[3]=roosta',
'most_relevant': 'sort=22',
'most_viewed': 'sort=4',
'newest': 'sort=1',
'best_selling': 'sort=7',
'cheapest': 'sort=20',
'most_expensive': 'sort=21',
'fastest_post': 'sort=25',
}
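        # The chosen filter keys are joined into the URL query string by
        # `get_products`, e.g. ['only_digiplus', 'cheapest'] -> 'only_plus=1&sort=20'.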

    def _scrape_page_products(self, url: str) -> Generator[Dict[str, str], None, None]:
        """Scrapes a single results page and yields each product's data."""
        print(f'Scraping {url}...')
        response = self.session.get(url)
        response.html.render(timeout=50, sleep=2)  # Wait 2 seconds after the page has loaded
        products = response.html.xpath('//a[contains(@class, "d-block pointer pos-relative")]')
        for product in products:
            # Relative paths (`.//`) keep every lookup scoped to the current product card.
            name = product.xpath(
                './/h2[contains(@class, "ellipsis-2 text-body2-strong")]'
                ' | .//h3[contains(@class, "ellipsis-2 text-body2-strong")]',
                first=True,
            )
            if name is None:  # Skip anchor tags that are not product cards
                continue
            name = name.text
            price = product.xpath('.//div[@class="pt-1 d-flex flex-column ai-stretch jc-between"]'
                                  '//div[contains(@class, "jc-end gap-1")]', first=True)
            price = price.text if price is not None else 'ناموجود'  # Persian for 'unavailable'
            discount = product.xpath('.//div[@class="pt-1 d-flex flex-column ai-stretch jc-between"]'
                                     '//div[contains(@class, "__discountWrapper__")]', first=True)
            discount = discount.text if discount is not None else '-'
            star = product.xpath('.//div[@class="grow-1 d-flex flex-column ai-stretch jc-start"]'
                                 '//p[contains(@class, "text-body2-strong")]', first=True)
            star = star.text if star is not None else '-'
            link = f'https://www.digikala.com{product.attrs["href"]}'
yield {
'name': name,
'price': price,
'discount': discount,
'star': star,
'link': link,
}

    def get_products(self, subject: str, pages_limit: Optional[int] = None,
                     filters: Optional[List[str]] = None) -> List[Dict[str, str]]:
        """Scrapes products data from the Digikala.com website.

        Args:
            subject: The search subject (a keyword or category name).
            pages_limit: The number of result pages to scrape. Defaults to 3.
            filters: A list of filter names (keys of `self.filters`) to apply
                to the search. By default no filters are applied and all
                products are scraped.

        Returns:
            A list of dictionaries of product data: name, price, discount,
            star and link.
        """
        if pages_limit is None:
            pages_limit = 3
        query = None
        if filters is not None:
            query = '&'.join(self.filters[name] for name in filters)
        products = []
        for page_no in range(1, pages_limit + 1):
            url = f'https://www.digikala.com/search/{subject}/?page={page_no}&force_search_instead=1'
            if query:  # Appending a literal `None` would corrupt the search URL
                url = f'{url}&{query}'
            products.extend(self._scrape_page_products(url))
        return products
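

# Minimal programmatic usage sketch (the subject below is illustrative; network
# access and the Chromium build that `render()` downloads on first run are assumed):
#
#     scraper = DigikalaScraper()
#     items = scraper.get_products('laptop', pages_limit=1,
#                                  filters=['available_items', 'cheapest'])
#     items[0]  # -> {'name': ..., 'price': ..., 'discount': ..., 'star': ..., 'link': ...}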


def _get_dict_keys_by_indexes(
    dictionary: Dict[Any, Any],
    indexes: List[int],
) -> Generator[Any, None, None]:
    """Yields the dictionary keys located at the given positional indexes."""
    for idx, key in enumerate(dictionary):
        if idx in indexes:
            yield key
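
# For example, given {'a': 1, 'b': 2, 'c': 3} and indexes [0, 2], the generator
# yields 'a' and then 'c' (dicts preserve insertion order in Python 3.7+, so
# the indexes map onto the keys' definition order).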


if __name__ == '__main__':
    import sys
    from pathlib import Path

    digikala_scraper = DigikalaScraper()
    print('\n\t\tDigikala.com Scraper: Scrape and export products data easily in a few seconds\n')
    subject = input('Enter the subject to search for: ')
    pages_limit = input('Enter the number of pages to scrape (default is 3): ')
    pages_limit = int(pages_limit) if pages_limit.isnumeric() else None
    filters = input("""Some filters can be applied to the search results:
 1. Only DigiPlus                          10. Sold by indigenous sellers
 2. Only supermarkets                      11. Most relevant
 3. Shipped by the seller                  12. Most viewed
 4. Fast delivery                          13. Newest
 5. Only available items                   14. Best selling
 6. Only available in Digikala's stores    15. Cheapest
 7. Sold by Digikala itself                16. Most expensive
 8. Sold by official sellers               17. Fastest post
 9. Sold by trusted sellers
Enter the filters to apply (no filters by default): """)
    filters = list(_get_dict_keys_by_indexes(
        digikala_scraper.filters,
        [int(number) - 1 for number in filters.split()],
    ))
print() # Just to make the output look better
results = pd.DataFrame(digikala_scraper.get_products(subject, pages_limit, filters if filters else None))
if results.empty:
print('No results found. Please try again later.')
sys.exit(1)
Path('raw_Data').mkdir(exist_ok=True)
results.to_csv(f'raw_Data/{subject}.csv', index=False)
print('All done! Check the results in the raw_Data folder.')
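
# Resulting CSV layout (header row only; cell values come from the live site):
# name,price,discount,star,link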