import sutro as so
import polars as pl
from pydantic import BaseModel
from random import randint
# Product catalogue: keep at most the first 20,000 rows of the Amazon products
# parquet file hosted on the Hugging Face Hub.
products_df = pl.read_parquet(
    'hf://datasets/ckandemir/amazon-products/data/train-00000-of-00001.parquet'
)[0:20000]

# Persona pool: a single chunk of the synthetic-humans dataset, narrowed below.
personas_df = pl.read_parquet(
    'hf://datasets/sutro/synthetic-humans-50k/chunk_0.parquet'
)

# Only personas whose `location` exactly matches one of these strings are kept.
target_cities = [
    'New York, New York',
    'Los Angeles, California',
    'Chicago, Illinois',
    'Houston, Texas',
    'Miami, Florida',
    'Seattle, Washington',
    'Boston, Massachusetts',
    'San Francisco, California',
    'Washington, D.C.',
    'Atlanta, Georgia',
    'Philadelphia, Pennsylvania',
    'Phoenix, Arizona',
    'San Diego, California',
    'San Jose, California',
    'Austin, Texas',
]

# Ages 22 through 30, both bounds inclusive.
age_in_range = (pl.col('age') >= 22) & (pl.col('age') <= 30)
lives_in_target_city = pl.col('location').is_in(target_cities)

personas_df = personas_df.filter(age_in_range & lives_in_target_city)
def get_random_persona_demographic_summary():
    """Return the `demographic_summary` value of one randomly chosen persona.

    Reads the module-level `personas_df`. Each call draws a fresh integer
    seed in [0, 1000000] and uses it to sample a single row.
    """
    draw_seed = randint(0, 1000000)
    sampled_row = personas_df.sample(1, seed=draw_seed)
    summaries = sampled_row['demographic_summary']
    return summaries[0]
# Pick one persona summary per product row in a single vectorized draw.
# Sampling with replacement gives every product an independent pick from the
# filtered persona pool (repeats allowed), which is the same distribution as
# sampling one row at a time, but without one DataFrame.sample call per
# product row.
random_personas = (
    personas_df['demographic_summary']
    .sample(n=len(products_df), with_replacement=True)
    .to_list()
)

# Attach the picks as a new `persona` column, aligned row-for-row with the
# products.
products_df = products_df.with_columns(
    pl.Series("persona", random_personas)
)
# Build the per-row model input: "<label><value>" segments separated by a
# single space, in the order listed here.
labelled_columns = [
    ('Product Name: ', 'Product Name'),
    ('Product Description: ', 'Description'),
    ('Price: ', 'Selling Price'),
    ('Reviewer Persona: ', 'persona'),
]

info_parts = []
for label, column_name in labelled_columns:
    if info_parts:
        # Separator between consecutive segments (none before the first).
        info_parts.append(pl.lit(' '))
    info_parts.append(pl.lit(label))
    info_parts.append(pl.col(column_name))

# ignore_nulls=True: a null column value is skipped rather than making the
# whole concatenated string null; its label is still emitted.
product_info_expr = pl.concat_str(info_parts, ignore_nulls=True).alias('product_info')
products_df = products_df.with_columns(product_info_expr)
# Instructions sent with every row as the `system_prompt` argument of the
# so.infer call. The per-row input is the `product_info` string, which carries
# the product name, description, price and reviewer persona mentioned here.
system_prompt = """You will be given a product name, description, and price.
You will also be given a reviewer persona.
Your task is to generate a novel product review from the reviewer persona's perspective.
Include a title, text, author, product name, product
description, product category, and rating out of 5.
"""
# Structured-output schema for one generated review; passed to so.infer as
# `output_schema`. The fields mirror the items requested in `system_prompt`.
# NOTE: documented with `#` comments rather than a class docstring on purpose,
# because pydantic would copy a docstring into the generated schema's description.
class ProductReview(BaseModel):
    # Headline of the review.
    review_title: str
    # Body text of the review.
    review_text: str
    # Name of the review's author (the prompt asks for an "author").
    review_author: str
    # Product name, description and category as reported back by the model.
    product_name: str
    product_description: str
    product_category: str
    # Rating "out of 5" per the prompt; the type only enforces an integer,
    # not a range.
    rating_out_of_5: int
# Only the first 100 product rows are submitted for generation.
preview_rows = products_df[0:100]

# Each row's `product_info` string is the model input and `ProductReview` is
# the requested output structure.
# NOTE(review): the shape/type of `results` depends on so.infer and is not
# visible in this file; confirm against the Sutro SDK.
results = so.infer(
    preview_rows,
    column="product_info",
    model="qwen-3-4b",
    system_prompt=system_prompt,
    output_schema=ProductReview,
)