Google Gemini Integration

Combine Supacrawler web scraping with Google Gemini AI for advanced content analysis and multimodal processing. This guide shows you how to scrape content and process it through Gemini for insights, analysis, and intelligent automation.

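Prerequisites

Before you start, install the Supacrawler Python SDK and the Google Generative AI Python package (imported as google.generativeai), and set the SUPACRAWLER_API_KEY and GOOGLE_AI_API_KEY environment variables that the examples below read.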

Quick example

Scrape content and analyze it with Google Gemini:

import requests
import os
import google.generativeai as genai
from supacrawler import SupacrawlerClient

# Set up both API clients. Each key is read from the environment with a
# plain lookup, so a missing variable raises KeyError at this point.
supacrawler_api_key = os.environ['SUPACRAWLER_API_KEY']
supacrawler = SupacrawlerClient(api_key=supacrawler_api_key)

google_ai_api_key = os.environ['GOOGLE_AI_API_KEY']
genai.configure(api_key=google_ai_api_key)

def scrape_and_analyze_with_gemini(url, analysis_prompt):
    """Scrape a page with Supacrawler and analyze it with Gemini.

    The page is scraped as markdown with a screenshot requested. The markdown
    is sent to ``gemini-1.5-flash`` together with ``analysis_prompt``. When
    the scrape result carries a screenshot URL and the image downloads with
    HTTP 200, the image is analyzed in a second Gemini call.

    Args:
        url: Address of the page to scrape.
        analysis_prompt: What the text analysis should focus on.

    Returns:
        A dict with the keys ``url``, ``title``, ``content_analysis``,
        ``visual_analysis`` (``None`` when no screenshot was analyzed) and
        ``content``.

    Raises:
        requests.exceptions.RequestException: If the screenshot download
            fails at the network level or exceeds the timeout.
    """
    # Step 1: Scrape content with Supacrawler
    result = supacrawler.scrape(url, format="markdown", include_screenshot=True)

    # Step 2: Process with Gemini
    model = genai.GenerativeModel('gemini-1.5-flash')

    # For text analysis
    text_response = model.generate_content([
        f"Analyze this web content from {url}:",
        result.content,
        f"\nAnalysis focus: {analysis_prompt}"
    ])

    # For image analysis (if screenshot available)
    image_analysis = None
    if result.screenshot_url:
        # Bound the download: without a timeout an unresponsive image host
        # would block this call indefinitely.
        screenshot_response = requests.get(result.screenshot_url, timeout=30)
        # A non-200 response is treated as "no screenshot" rather than an error.
        if screenshot_response.status_code == 200:
            image_response = model.generate_content([
                "Analyze this webpage screenshot for visual elements, design, and user experience:",
                # NOTE(review): assumes the screenshot is a PNG - confirm
                # against the Supacrawler screenshot format.
                {"mime_type": "image/png", "data": screenshot_response.content}
            ])
            image_analysis = image_response.text

    return {
        'url': url,
        'title': result.title,
        'content_analysis': text_response.text,
        'visual_analysis': image_analysis,
        'content': result.content
    }

# Example usage: run one analysis and print whatever came back.
result = scrape_and_analyze_with_gemini(
    url="https://techcrunch.com/ai",
    analysis_prompt="Identify key AI trends and their potential business impact",
)

content_analysis = result['content_analysis']
print(f"Content Analysis: {content_analysis}")

# The visual analysis is None (falsy) when no screenshot was analyzed.
visual_analysis = result['visual_analysis']
if visual_analysis:
    print(f"Visual Analysis: {visual_analysis}")

Multimodal content analysis

Leverage Gemini's multimodal capabilities:

import google.generativeai as genai
from supacrawler import SupacrawlerClient

def multimodal_website_analysis(url):
    """Run a combined text-and-screenshot analysis of a website with Gemini.

    The page is scraped as markdown with a full-page screenshot requested.
    The screenshot is downloaded and sent to ``gemini-1.5-pro`` in a single
    request together with the page text and a fixed analysis prompt.

    Args:
        url: Address of the website to analyze.

    Returns:
        A dict with the keys ``url``, ``title``, ``analysis`` (the model's
        response text), ``content_length`` and ``screenshot_url``.

    Raises:
        requests.exceptions.HTTPError: If the screenshot download returns a
            4xx/5xx status.
        requests.exceptions.RequestException: If the download fails at the
            network level, exceeds the timeout, or the scrape result has no
            usable screenshot URL.
    """
    # Scrape with both content and screenshot
    result = supacrawler.scrape(url,
        format="markdown",
        include_screenshot=True,
        screenshot_full_page=True
    )

    model = genai.GenerativeModel('gemini-1.5-pro')  # Pro for complex analysis

    # Download screenshot for analysis. The timeout keeps an unresponsive
    # image host from blocking the call indefinitely.
    screenshot_response = requests.get(result.screenshot_url, timeout=30)
    # Fail loudly on an error status: otherwise the error body would be sent
    # to Gemini labelled as a PNG image.
    screenshot_response.raise_for_status()
    screenshot_data = screenshot_response.content

    # Multimodal analysis prompt
    analysis_prompt = """
    Analyze this website comprehensively using both the text content and visual screenshot:
    
    1. Content Analysis:
       - Main topics and themes
       - Key information and data points
       - Content quality and structure
    
    2. Visual Analysis:
       - Design and layout quality
       - User experience elements
       - Visual hierarchy and information architecture
       - Brand presentation
    
    3. Combined Insights:
       - How well the visual design supports the content
       - Overall effectiveness for the target audience
       - Recommendations for improvement
    
    Provide structured insights in JSON format.
    """

    # Send both text and image to Gemini
    response = model.generate_content([
        analysis_prompt,
        f"Website URL: {url}",
        f"Text Content:\n{result.content}",
        {
            # NOTE(review): assumes the screenshot is a PNG - confirm against
            # the Supacrawler screenshot format.
            "mime_type": "image/png",
            "data": screenshot_data
        }
    ])

    return {
        'url': url,
        'title': result.title,
        'analysis': response.text,
        'content_length': len(result.content),
        'screenshot_url': result.screenshot_url
    }

# Automated competitive analysis
def analyze_competitor_websites(competitor_urls=None):
    """Analyze several competitor websites and build a comparative report.

    Each URL is passed to ``multimodal_website_analysis``. A failure on one
    site is printed and skipped so the remaining sites are still analyzed.
    The successful analyses are then summarized by ``gemini-1.5-pro``.

    Args:
        competitor_urls: Iterable of website URLs to analyze. When ``None``
            (the default), the built-in placeholder competitor list is used.

    Returns:
        A dict with the keys ``individual_analyses`` (list of per-site result
        dicts) and ``comparative_report`` (the model's response text, or
        ``None`` when no site could be analyzed).
    """
    # Imported locally because the prompt below needs it and the surrounding
    # snippet does not import it at module level.
    import json

    if competitor_urls is None:
        competitor_urls = [
            "https://competitor1.com",
            "https://competitor2.com",
            "https://competitor3.com",
        ]

    analyses = []

    for competitor_url in competitor_urls:
        # Broad catch is deliberate: this is a per-site boundary, and one
        # failing site must not abort the whole batch.
        try:
            analysis = multimodal_website_analysis(competitor_url)
        except Exception as e:
            print(f"❌ Failed to analyze {competitor_url}: {e}")
        else:
            analyses.append(analysis)
            print(f"✅ Analyzed: {competitor_url}")

    # With nothing to compare there is no point in calling the model: it
    # would only be asked to summarize an empty list.
    if not analyses:
        return {
            'individual_analyses': analyses,
            'comparative_report': None
        }

    # Generate comparative report
    model = genai.GenerativeModel('gemini-1.5-pro')

    analyses_json = json.dumps([a['analysis'] for a in analyses], indent=2)
    comparative_prompt = f"""
    Based on these competitor website analyses, provide:
    1. Comparative strengths and weaknesses
    2. Common industry patterns
    3. Opportunities for differentiation
    4. Actionable recommendations
    
    Analyses: {analyses_json}
    """

    comparative_report = model.generate_content(comparative_prompt)

    return {
        'individual_analyses': analyses,
        'comparative_report': comparative_report.text
    }

Automated monitoring with AI insights

Set up continuous monitoring with Gemini analysis:

def setup_gemini_monitoring():
    """Register daily Supacrawler watch jobs that post changes to a webhook.

    One watch job is created per monitoring target by POSTing to the
    Supacrawler watch endpoint. Each job is configured with the
    ``https://your-app.com/api/gemini-analysis`` webhook URL and carries the
    target's name and analysis focus as webhook headers. The outcome of each
    registration is printed; a failed registration does not stop the
    remaining targets from being registered.

    Raises:
        KeyError: If the ``SUPACRAWLER_API_KEY`` environment variable is
            not set.
        requests.exceptions.RequestException: If a request fails at the
            network level or exceeds the timeout.
    """
    monitoring_targets = [
        {
            "name": "Product Updates",
            "url": "https://company.com/product-updates",
            "analysis_focus": "product features, pricing changes, roadmap updates"
        },
        {
            "name": "Industry News",
            "url": "https://industry-site.com/news",
            "analysis_focus": "market trends, regulatory changes, competitive moves"
        },
        {
            "name": "Research Publications",
            "url": "https://research-site.com/papers",
            "analysis_focus": "technical breakthroughs, research methodologies, applications"
        }
    ]

    # Read the key from the environment, matching how the Supacrawler client
    # is initialized; there is no module-level SUPACRAWLER_API_KEY constant.
    auth_headers = {"Authorization": f"Bearer {os.environ['SUPACRAWLER_API_KEY']}"}

    for target in monitoring_targets:
        # Create watch job with webhook
        response = requests.post("https://api.supacrawler.com/api/v1/watch",
            headers=auth_headers,
            json={
                "url": target["url"],
                "frequency": "daily",
                "selector": "article, .post, .content",
                "notification_preference": "changes_only",

                # Include both content and visuals
                "include_html": True,
                "include_image": True,
                "full_page": True,

                # Send to Gemini analysis webhook
                "webhook_url": "https://your-app.com/api/gemini-analysis",
                "webhook_headers": {
                    "X-Target": target["name"],
                    "X-Analysis-Focus": target["analysis_focus"]
                }
            },
            # Keep an unresponsive API from blocking the setup indefinitely.
            timeout=30,
        )

        # Only report success when the API actually accepted the job.
        if response.ok:
            print(f"Gemini monitoring setup for {target['name']}")
        else:
            print(
                f"Failed to set up Gemini monitoring for {target['name']}: "
                f"HTTP {response.status_code}"
            )

# Webhook handler for Gemini analysis
@app.route('/api/gemini-analysis', methods=['POST'])
def process_with_gemini():
    """Handle a change webhook by analyzing the new content with Gemini.

    Reads the JSON payload of the request. When it contains ``new_content``,
    the content (plus the screenshot, if one is referenced and downloads
    with HTTP 200) is sent to ``gemini-1.5-flash``. The analysis is stored
    via ``save_gemini_analysis`` and, if ``is_high_priority_insight`` flags
    it, forwarded via ``send_priority_alert``.

    Returns:
        A JSON response ``{'status': 'analyzed'}``. The same body is returned
        when the payload has no new content and nothing was analyzed.
    """
    # Tolerate a null JSON body so the lookups below do not fail on None.
    data = request.json or {}

    # NOTE(review): the watch job registers X-Target / X-Analysis-Focus as
    # webhook_headers, which suggests they arrive as HTTP headers. The
    # payload lookup is kept first for compatibility - confirm against the
    # Supacrawler webhook payload format.
    payload_headers = data.get('headers') or {}
    target_name = (
        payload_headers.get('X-Target') or request.headers.get('X-Target')
    )
    analysis_focus = (
        payload_headers.get('X-Analysis-Focus')
        or request.headers.get('X-Analysis-Focus')
    )

    # Extract changes
    new_content = data.get('new_content', '')
    screenshot_url = data.get('screenshot_url')

    if new_content:
        # Initialize Gemini
        model = genai.GenerativeModel('gemini-1.5-flash')

        # Prepare analysis inputs
        inputs = [
            f"Analyze these website changes for {target_name}:",
            f"Focus areas: {analysis_focus}",
            f"New content:\n{new_content}"
        ]

        # Add screenshot if available
        if screenshot_url:
            # Bound the download so a slow image host cannot stall the
            # webhook handler indefinitely.
            screenshot_response = requests.get(screenshot_url, timeout=30)
            if screenshot_response.status_code == 200:
                inputs.append({
                    # NOTE(review): assumes the screenshot is a PNG - confirm.
                    "mime_type": "image/png",
                    "data": screenshot_response.content
                })

        # Get Gemini analysis
        response = model.generate_content(inputs)
        analysis = response.text

        # Store and alert on important changes
        save_gemini_analysis({
            'target': target_name,
            'url': data.get('url'),
            'analysis': analysis,
            'content': new_content,
            'timestamp': data.get('timestamp')
        })

        # Check for high-priority insights
        if is_high_priority_insight(analysis):
            send_priority_alert(target_name, analysis)

    return jsonify({'status': 'analyzed'})

Advanced use cases

Content strategy analysis

  • Content gap analysis: Compare your content against competitors
  • SEO optimization: Analyze content structure and keyword usage
  • User experience evaluation: Assess content clarity and engagement
  • Brand consistency: Monitor brand messaging across touchpoints

Market intelligence

  • Trend detection: Identify emerging trends from multiple sources
  • Sentiment analysis: Understand market sentiment from news and social media
  • Competitive positioning: Analyze competitor strategies and messaging
  • Customer research: Process customer feedback and reviews for insights

Best practices

  • Model selection: Use Gemini 1.5 Flash for speed and Gemini 1.5 Pro for complex analysis
  • Prompt engineering: Craft specific, detailed prompts for better results
  • Multimodal advantage: Leverage both text and visual analysis capabilities
  • Rate limiting: Implement proper rate limiting for API calls
  • Cost optimization: Monitor usage and optimize for cost-effectiveness

Was this page helpful?