Skip to content

Commit 0452173

Browse files
committed
warn the user when the same URL pattern is present in the rule
1 parent ee30808 commit 0452173

File tree

5 files changed

+201
-131
lines changed

5 files changed

+201
-131
lines changed

scrapy_poet/overrides.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from abc import ABC, abstractmethod
33
from collections import defaultdict
44
from typing import Callable, Dict, Iterable, List, Mapping, Optional, Tuple, Type, Union
5+
from warnings import warn
56

67
from scrapy import Request
78
from scrapy.crawler import Crawler
@@ -110,6 +111,26 @@ def add_rule(self, rule: RuleFromUser) -> None:
110111
for_patterns=Patterns([pattern]), use=use, instead_of=instead_of
111112
)
112113
self.rules.append(rule)
114+
115+
# A common case when a PO subclasses another one with the same URL
116+
# pattern. See the test_item_return_subclass() test case.
117+
matched = self.matcher[rule.to_return]
118+
if [
119+
pattern
120+
for pattern in matched.patterns.values()
121+
if pattern == rule.for_patterns
122+
]:
123+
# TODO: It would be great to also list down the rules having the
124+
# same URL pattern. But this would require some refactoring.
125+
warn(
126+
f"A similar URL pattern {list(matched.patterns.values())} has been "
127+
f"declared earlier which uses to_return={rule.to_return}. When "
128+
f"matching URLs against rules, the latest declared rule is used. "
129+
f"Consider explicitly updating the priority of the rules containing "
130+
f"the said URL pattern to easily match the expectations when reading "
131+
f"the code."
132+
)
133+
113134
if rule.instead_of:
114135
self.matcher[rule.instead_of].add_or_update(
115136
len(self.rules) - 1, rule.for_patterns
@@ -127,6 +148,7 @@ def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
127148
overrides[instead_of] = self.rules[rule_id].use
128149
return overrides
129150

151+
# TODO: Refactor later
130152
def rules_overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
131153
overrides: Dict[Callable, Callable] = {}
132154
for instead_of, matcher in self.matcher.items():

tests/po_lib/__init__.py

Lines changed: 0 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -3,122 +3,11 @@
33
"""
44
import socket
55

6-
import attrs
76
from url_matcher.util import get_domain
8-
from web_poet import Injectable, ItemPage, WebPage, field, handle_urls, item_from_fields
97

108
from tests.mockserver import get_ephemeral_port
119

1210
# Need to define it here since it's always changing
1311
DOMAIN = get_domain(socket.gethostbyname(socket.gethostname()))
1412
PORT = get_ephemeral_port()
1513
URL = f"{DOMAIN}:{PORT}"
16-
17-
18-
class POOverridenPage(WebPage):
19-
def to_item(self):
20-
return {"msg": "PO that will be replaced"}
21-
22-
23-
@handle_urls(URL, instead_of=POOverridenPage)
24-
class POIntegrationPage(WebPage):
25-
def to_item(self):
26-
return {"msg": "PO replacement"}
27-
28-
29-
@attrs.define
30-
class Product:
31-
name: str
32-
33-
34-
@attrs.define
35-
class ParentProduct:
36-
name: str
37-
38-
39-
@attrs.define
40-
class ReplacedProduct:
41-
name: str
42-
43-
44-
@attrs.define
45-
class ParentReplacedProduct:
46-
name: str
47-
48-
49-
@attrs.define
50-
class SubclassReplacedProduct:
51-
name: str
52-
53-
54-
@attrs.define
55-
class StandaloneProduct:
56-
name: str
57-
58-
59-
@attrs.define
60-
class ProductFromInjectable:
61-
name: str
62-
63-
64-
@handle_urls(URL)
65-
class ProductPage(ItemPage[Product]):
66-
@field
67-
def name(self) -> str:
68-
return "product's name"
69-
70-
71-
@handle_urls(URL)
72-
class ParentProductPage(ItemPage[ParentProduct]):
73-
@field
74-
def name(self) -> str:
75-
return "parent product's name"
76-
77-
78-
@handle_urls(URL, priority=600)
79-
class SubclassProductPage(ParentProductPage):
80-
@field
81-
def name(self) -> str:
82-
return "subclass product's name"
83-
84-
85-
@handle_urls(URL, to_return=ReplacedProduct)
86-
class ReplacedProductPage(ItemPage[Product]):
87-
@field
88-
def name(self) -> str:
89-
return "replaced product's name"
90-
91-
92-
@handle_urls(URL)
93-
class ParentReplacedProductPage(ItemPage[ParentReplacedProduct]):
94-
@field
95-
def name(self) -> str:
96-
return "parent replaced product's name"
97-
98-
99-
@handle_urls(URL, to_return=SubclassReplacedProduct)
100-
class SubclassReplacedProductPage(ParentReplacedProductPage):
101-
@field
102-
def name(self) -> str:
103-
return "subclass replaced product's name"
104-
105-
106-
@handle_urls(URL, to_return=StandaloneProduct)
107-
class StandaloneProductPage(ItemPage):
108-
@field
109-
def name(self) -> str:
110-
return "standalone product's name"
111-
112-
113-
# TODO: cases where `instead_of` and `to_return` are present, including
114-
# permutations of the cases above
115-
116-
117-
@handle_urls(URL, to_return=ProductFromInjectable)
118-
class ProductFromInjectablePage(Injectable):
119-
@field
120-
def name(self) -> str:
121-
return "product from injectable"
122-
123-
async def to_item(self) -> ProductFromInjectable:
124-
return await item_from_fields(self, item_cls=ProductFromInjectable)

tests/po_lib/main.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import attrs
2+
from web_poet import Injectable, ItemPage, WebPage, field, handle_urls, item_from_fields
3+
4+
from . import URL
5+
6+
7+
class POOverridenPage(WebPage):
8+
def to_item(self):
9+
return {"msg": "PO that will be replaced"}
10+
11+
12+
@handle_urls(URL, instead_of=POOverridenPage)
13+
class POIntegrationPage(WebPage):
14+
def to_item(self):
15+
return {"msg": "PO replacement"}
16+
17+
18+
@attrs.define
19+
class Product:
20+
name: str
21+
22+
23+
@attrs.define
24+
class ParentProduct:
25+
name: str
26+
27+
28+
@attrs.define
29+
class PriorityParentProduct:
30+
name: str
31+
32+
33+
@attrs.define
34+
class ReplacedProduct:
35+
name: str
36+
37+
38+
@attrs.define
39+
class ParentReplacedProduct:
40+
name: str
41+
42+
43+
@attrs.define
44+
class SubclassReplacedProduct:
45+
name: str
46+
47+
48+
@attrs.define
49+
class StandaloneProduct:
50+
name: str
51+
52+
53+
@attrs.define
54+
class ProductFromInjectable:
55+
name: str
56+
57+
58+
@handle_urls(URL)
59+
class ProductPage(ItemPage[Product]):
60+
@field
61+
def name(self) -> str:
62+
return "product's name"
63+
64+
65+
@handle_urls(URL)
66+
class ParentProductPage(ItemPage[ParentProduct]):
67+
@field
68+
def name(self) -> str:
69+
return "parent product's name"
70+
71+
72+
@handle_urls(URL)
73+
class PriorityParentProductPage(ItemPage[PriorityParentProduct]):
74+
@field
75+
def name(self) -> str:
76+
return "priority parent product's name"
77+
78+
79+
@handle_urls(URL, to_return=ReplacedProduct)
80+
class ReplacedProductPage(ItemPage[Product]):
81+
@field
82+
def name(self) -> str:
83+
return "replaced product's name"
84+
85+
86+
@handle_urls(URL)
87+
class ParentReplacedProductPage(ItemPage[ParentReplacedProduct]):
88+
@field
89+
def name(self) -> str:
90+
return "parent replaced product's name"
91+
92+
93+
@handle_urls(URL, to_return=StandaloneProduct)
94+
class StandaloneProductPage(ItemPage):
95+
@field
96+
def name(self) -> str:
97+
return "standalone product's name"
98+
99+
100+
# TODO: cases where `instead_of` and `to_return` are present, including
101+
# permutations of the cases above
102+
103+
104+
@handle_urls(URL, to_return=ProductFromInjectable)
105+
class ProductFromInjectablePage(Injectable):
106+
@field
107+
def name(self) -> str:
108+
return "product from injectable"
109+
110+
async def to_item(self) -> ProductFromInjectable:
111+
return await item_from_fields(self, item_cls=ProductFromInjectable)

tests/po_lib/subclasses.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from web_poet import field, handle_urls
2+
3+
from . import URL
4+
from .main import (
5+
ParentProductPage,
6+
ParentReplacedProductPage,
7+
PriorityParentProductPage,
8+
SubclassReplacedProduct,
9+
)
10+
11+
12+
@handle_urls(URL)
13+
class SubclassProductPage(ParentProductPage):
14+
@field
15+
def name(self) -> str:
16+
return "subclass product's name"
17+
18+
19+
@handle_urls(URL, to_return=SubclassReplacedProduct)
20+
class SubclassReplacedProductPage(ParentReplacedProductPage):
21+
@field
22+
def name(self) -> str:
23+
return "subclass replaced product's name"
24+
25+
26+
@handle_urls(URL, priority=600)
27+
class PrioritySubclassProductPage(PriorityParentProductPage):
28+
@field
29+
def name(self) -> str:
30+
return "priority subclass product's name"

0 commit comments

Comments
 (0)