Spaces · Runtime error
Commit 730ca01 · John Yang committed
Parent(s): 69177fb

Code clean up
Browse files
- .gitignore +1 -0
- app.py +9 -10
- predict_help.py +17 -13

.gitignore
CHANGED

@@ -1,3 +1,4 @@
 *.pyc
+*.ipynb
 
 .DS_Store
app.py
CHANGED

@@ -119,7 +119,7 @@ def run_episode(goal, env, verbose=True):
     search_results_cache = {}
     visited_asins, clicked_options = set(), set()
     sub_page_type, page_type, page_num = None, None, None
-    search_terms, prod_title, asin
+    search_terms, prod_title, asin = None, None, None
     options = {}
 
     for i in range(100):
@@ -228,7 +228,6 @@ def run_episode(goal, env, verbose=True):
                 print(f"Parsing search results took {end-begin} seconds")
 
             search_results_cache[search_terms] = data
-            num_prods = len(data)
             for d in data:
                 title_to_asin_map[d['Title']] = d['asin']
         elif page_type == Page.ITEM_PAGE or page_type == Page.SUB_PAGE:
@@ -268,7 +267,7 @@ def run_episode(goal, env, verbose=True):
         # Dict of Info -> Valid Action State (Info)
         begin = time.time()
         prod_arg = product_map if page_type == Page.ITEM_PAGE else data
-        info = convert_dict_to_actions(page_type, prod_arg, asin, page_num
+        info = convert_dict_to_actions(page_type, prod_arg, asin, page_num)
         end = time.time()
         if verbose:
             print("Extracting available actions took", end-begin, "seconds")
@@ -294,19 +293,19 @@ def run_episode(goal, env, verbose=True):
     return_value['Selected Options'] = ', '.join(list(clicked_options))
     return return_value
 
-gr.Interface(fn=run_episode
+gr.Interface(fn=run_episode,
     inputs=[
         gr.inputs.Textbox(lines=7, label="Input Text"),
         gr.inputs.Radio(['Amazon', 'eBay'], type="value", default="Amazon", label='Environment')
-    ]
-    outputs="text"
+    ],
+    outputs="text",
     examples=[
         ["I want to find a gold floor lamp with a glass shade and a nickel finish that i can use for my living room, and price lower than 270.00 dollars", "Amazon"],
         ["I need some cute heart-shaped glittery cupcake picks as a gift to bring to a baby shower", "Amazon"],
         ["I'm trying to find white bluetooth speakers that are not only water resistant but also come with stereo sound", "eBay"],
         ["find me the soy free 3.5 ounce 4-pack of dang thai rice chips, and make sure they are the aged cheddar flavor. i also need the ones in the resealable bags", "eBay"]
-    ]
-    title="WebShop"
-    article="<p style='padding-top:15px;text-align:center;'>To learn more about this project, check out the <a href='https://webshop-pnlp.github.io/' target='_blank'>project page</a>!</p>"
-    description="<p style='text-align:center;'>Sim-to-real transfer of agent trained on WebShop to search a desired product on Amazon from any natural language query!</p>"
+    ],
+    title="WebShop",
+    article="<p style='padding-top:15px;text-align:center;'>To learn more about this project, check out the <a href='https://webshop-pnlp.github.io/' target='_blank'>project page</a>!</p>",
+    description="<p style='text-align:center;'>Sim-to-real transfer of agent trained on WebShop to search a desired product on Amazon from any natural language query!</p>",
 ).launch(inline=False)
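
Note: several of the `-` lines above appear to have been left mid-edit in parent 69177fb, which is consistent with the Space's "Runtime error" status. A bare `search_terms, prod_title, asin` is a tuple expression over unbound names (a NameError on the first call), and the unclosed `convert_dict_to_actions(` call is a SyntaxError. A minimal sketch of the failure mode and the fix:

    # Before (as in parent 69177fb): evaluating unbound names fails at call time
    search_terms, prod_title, asin    # NameError: name 'search_terms' is not defined

    # After this commit: episode state is bound explicitly before the loop
    search_terms, prod_title, asin = None, None, None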
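
For reference, here is the repaired Gradio launcher assembled from the `+` lines above, trimmed to a self-contained sketch: `run_episode` is stubbed out (the real agent loop is defined earlier in app.py), and `gr.inputs.Textbox` / `gr.inputs.Radio` belong to the legacy pre-3.x Gradio input API this Space targets.

    import gradio as gr

    def run_episode(goal, env, verbose=True):
        # Stub standing in for the agent loop defined in app.py.
        return f"Would search {env} for: {goal!r}"

    gr.Interface(
        fn=run_episode,
        inputs=[
            gr.inputs.Textbox(lines=7, label="Input Text"),
            gr.inputs.Radio(['Amazon', 'eBay'], type="value", default="Amazon", label='Environment'),
        ],
        outputs="text",
        title="WebShop",
    ).launch(inline=False)

The commit itself only restores the commas between these keyword arguments; without them (`fn=run_episode` followed directly by `inputs=[...]` inside the parentheses) the file does not parse.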
predict_help.py
CHANGED

@@ -22,12 +22,6 @@ NUM_PROD_LIMIT = 10
 WEBSHOP_URL = "http://3.83.245.205:3000"
 WEBSHOP_SESSION = "abc"
 
-API = '85956985fae328bfe5a759a2984448d2'
-def get_url(url):
-    payload = {'api_key': API, 'url': url , 'country_code': 'us'}
-    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
-    return proxy_url
-
 def parse_results_ebay(query, page_num=None, verbose=True):
     query_string = '+'.join(query.split())
     page_num = 1 if page_num is None else page_num
@@ -64,6 +58,7 @@ def parse_results_ebay(query, page_num=None, verbose=True):
         print(f"Scraped {len(results)} products")
     return results
 
+
 def parse_item_page_ebay(asin, verbose=True):
     product_dict = {}
     product_dict["asin"] = asin
@@ -188,6 +183,7 @@ def parse_results_ws(query, page_num=None, verbose=True):
         print(f"Scraped {len(results)} products")
     return results
 
+
 def parse_item_page_ws(asin, query, page_num, options, verbose=True):
     product_dict = {}
     product_dict["asin"] = asin
@@ -199,7 +195,7 @@ def parse_item_page_ws(asin, query, page_num, options, verbose=True):
         f'{asin}/{query_string}/{page_num}/{options_string}'
     )
     if verbose:
-        print("Item Page URL: "
+        print(f"Item Page URL: {url}")
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     soup = BeautifulSoup(webpage.content, 'html.parser')
 
@@ -240,6 +236,8 @@ def parse_item_page_ws(asin, query, page_num, options, verbose=True):
         f'{WEBSHOP_URL}/item_sub_page/{WEBSHOP_SESSION}/'
         f'{asin}/{query_string}/{page_num}/Description/{options_string}'
     )
+    if verbose:
+        print(f"Item Description URL: {url}")
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     soup = BeautifulSoup(webpage.content, 'html.parser')
     product_dict["Description"] = soup.find(name="p", attrs={'class': 'product-info'}).text.strip()
@@ -249,6 +247,8 @@ def parse_item_page_ws(asin, query, page_num, options, verbose=True):
         f'{WEBSHOP_URL}/item_sub_page/{WEBSHOP_SESSION}/'
         f'{asin}/{query_string}/{page_num}/Features/{options_string}'
     )
+    if verbose:
+        print(f"Item Features URL: {url}")
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     soup = BeautifulSoup(webpage.content, 'html.parser')
     bullets = soup.find(name="ul").findAll(name="li")
@@ -256,6 +256,7 @@ def parse_item_page_ws(asin, query, page_num, options, verbose=True):
 
     return product_dict
 
+
 # Query -> Search Result ASINs
 def parse_results_amz(query, page_num=None, verbose=True):
     url = 'https://www.amazon.com/s?k=' + query.replace(" ", "+")
@@ -289,6 +290,7 @@ def parse_results_amz(query, page_num=None, verbose=True):
         print("Scraped", len(results), "products")
     return results
 
+
 # Scrape information of each product
 def parse_item_page_amz(asin, verbose=True):
     product_dict = {}
@@ -385,7 +387,9 @@ def parse_item_page_amz(asin, verbose=True):
     product_dict["options"], product_dict["option_to_image"] = options, options_to_image
     return product_dict
 
+
 # Get text observation from html
+# TODO[john-b-yang]: Similar to web_agent_site/envs/...text_env.py func def, merge?
 def convert_html_to_text(html, simple=False, clicked_options=None, visited_asins=None):
     def tag_visible(element):
         ignore = {'style', 'script', 'head', 'title', 'meta', '[document]'}
@@ -419,18 +423,18 @@ def convert_html_to_text(html, simple=False, clicked_options=None, visited_asins
         observation += processed_t + '\n'
     return observation
 
-
-
+
+# Get action from dict of values retrieved from html
+def convert_dict_to_actions(page_type, products=None, asin=None, page_num=None) -> dict:
     info = {"valid": []}
     if page_type == Page.RESULTS:
         info["valid"] = ['click[back to search]']
-        if products is None or page_num is None
+        if products is None or page_num is None:
             print(page_num)
-            print(num_prods)
             print(products)
-            raise Exception('Provide `products`, `
+            raise Exception('Provide `products`, `page_num` to get `results` valid actions')
         # Decide whether to add `next >` as clickable based on # of search results
-        if
+        if len(products) > 10:
             info["valid"].append('click[next >]')
         # Add `< prev` as clickable if not first page of search results
         if page_num > 1:
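
Beyond completing half-written statements, the first hunk deletes a hard-coded ScraperAPI key together with its `get_url` proxy helper; the scrapers below call `requests.get` on the target URLs directly. Were proxying ever reinstated, the conventional pattern is to read the key from the environment rather than committing it. A hypothetical sketch, not part of this commit (the `SCRAPER_API_KEY` variable name is an assumption):

    import os
    from urllib.parse import urlencode

    def get_url(url):
        # Hypothetical replacement for the deleted helper: the key comes from
        # the environment instead of a literal checked into the repository.
        payload = {
            'api_key': os.environ['SCRAPER_API_KEY'],  # assumed env var name
            'url': url,
            'country_code': 'us',
        }
        return 'http://api.scraperapi.com/?' + urlencode(payload)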
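
The last hunk restores `convert_dict_to_actions`, whose signature line appears to have been lost in the parent commit, leaving its body dangling after `convert_html_to_text`. Assembled from the `+` lines, the results-page branch now reads as below; the two trailing lines (the `click[< prev]` append and the return) are inferred continuations, since the hunk ends at `if page_num > 1:`, and `Page` is the enum used throughout predict_help.py.

    # Get action from dict of values retrieved from html
    def convert_dict_to_actions(page_type, products=None, asin=None, page_num=None) -> dict:
        info = {"valid": []}
        if page_type == Page.RESULTS:
            info["valid"] = ['click[back to search]']
            if products is None or page_num is None:
                print(page_num)
                print(products)
                raise Exception('Provide `products`, `page_num` to get `results` valid actions')
            # Decide whether to add `next >` as clickable based on # of search results
            if len(products) > 10:
                info["valid"].append('click[next >]')
            # Add `< prev` as clickable if not first page of search results
            if page_num > 1:
                info["valid"].append('click[< prev]')  # inferred continuation
        return info                                    # inferred continuation

Two details worth noting: the threshold in `len(products) > 10` matches the `NUM_PROD_LIMIT = 10` visible in the first hunk's header, and the deleted `print(num_prods)` referenced a name that is not among this function's parameters or locals (its assignment, `num_prods = len(data)`, was removed from app.py in this same commit), so dropping it avoids a NameError on the error path.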