Bedtools Migration Guide

This guide maps bedtools commands to their GIQL equivalents. If you’re familiar with bedtools and want to replicate specific operations in GIQL, use this reference to find the corresponding query patterns.

Quick Reference Table

Bedtools Command

GIQL Equivalent

Recipe

intersect -a A -b B

SELECT DISTINCT a.* FROM a, b WHERE a.pos INTERSECTS b.pos

Default: Report overlaps between A and B

intersect -a A -b B -wa

SELECT a.* FROM a, b WHERE a.pos INTERSECTS b.pos

-wa: Write original A entry for each overlap

intersect -a A -b B -wb

SELECT b.* FROM a, b WHERE a.pos INTERSECTS b.pos

-wb: Write original B entry for each overlap

intersect -a A -b B -wa -wb

SELECT a.*, b.* FROM a, b WHERE a.pos INTERSECTS b.pos

-wa -wb: Write both A and B entries

intersect -a A -b B -v

SELECT a.* FROM a LEFT JOIN b ... WHERE b.chr IS NULL

-v: Report A entries with NO overlap in B

intersect -a A -b B -u

SELECT DISTINCT a.* FROM a JOIN b ...

-u: Report A entries with ANY overlap (unique)

intersect -a A -b B -c

SELECT a.*, COUNT(b.name) ... GROUP BY ...

-c: Count B overlaps for each A feature

intersect -a A -b B -wo

SELECT a.*, b.*, (overlap calculation) ...

-wo: Write overlap amount in base pairs

intersect -a A -b B -loj

SELECT a.*, b.* FROM a LEFT JOIN b ...

-loj: Left outer join

closest -a A -b B -k N

CROSS JOIN LATERAL NEAREST(b, reference=a.pos, k=N)

-k: Find k nearest features

closest -a A -b B -d

SELECT ..., DISTANCE(a.pos, b.pos) ...

-d: Report distance

cluster -i A

SELECT *, CLUSTER(interval) AS cluster_id FROM a

Basic clustering

cluster -i A -d N

SELECT *, CLUSTER(interval, N) AS cluster_id FROM a

-d: Cluster with distance parameter

merge -i A

SELECT MERGE(interval) FROM a

Basic merge

merge -i A -d N

SELECT MERGE(interval, N) FROM a

-d: Merge with distance parameter

merge -i A -c 1 -o count

SELECT MERGE(interval), COUNT(*) FROM a

-c -o count: Count merged features

bedtools intersect

Default: Report overlaps between A and B

Bedtools:

bedtools intersect -a file_a.bed -b file_b.bed

GIQL:

cursor = engine.execute("""
    SELECT DISTINCT a.*
    FROM features_a a, features_b b
    WHERE a.interval INTERSECTS b.interval
""")

-wa: Write original A entry for each overlap

Bedtools:

bedtools intersect -a file_a.bed -b file_b.bed -wa

GIQL:

cursor = engine.execute("""
    SELECT a.*
    FROM features_a a, features_b b
    WHERE a.interval INTERSECTS b.interval
""")

-wb: Write original B entry for each overlap

Bedtools:

bedtools intersect -a file_a.bed -b file_b.bed -wb

GIQL:

cursor = engine.execute("""
    SELECT b.*
    FROM features_a a, features_b b
    WHERE a.interval INTERSECTS b.interval
""")

-wa -wb: Write both A and B entries

Bedtools:

bedtools intersect -a file_a.bed -b file_b.bed -wa -wb

GIQL:

cursor = engine.execute("""
    SELECT a.*, b.*
    FROM features_a a, features_b b
    WHERE a.interval INTERSECTS b.interval
""")

-v: Report A entries with NO overlap in B

Bedtools:

bedtools intersect -a file_a.bed -b file_b.bed -v

GIQL:

cursor = engine.execute("""
    SELECT a.*
    FROM features_a a
    LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
    WHERE b.chromosome IS NULL
""")

-u: Report A entries with ANY overlap (unique)

Bedtools:

bedtools intersect -a file_a.bed -b file_b.bed -u

GIQL:

cursor = engine.execute("""
    SELECT DISTINCT a.*
    FROM features_a a
    INNER JOIN features_b b ON a.interval INTERSECTS b.interval
""")

-c: Count B overlaps for each A feature

Bedtools:

bedtools intersect -a file_a.bed -b file_b.bed -c

GIQL:

cursor = engine.execute("""
    SELECT a.*, COUNT(b.name) AS overlap_count
    FROM features_a a
    LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
    GROUP BY a.chromosome, a.start_pos, a.end_pos, a.name, a.score, a.strand
""")

-wo: Write overlap amount in base pairs

Bedtools:

bedtools intersect -a file_a.bed -b file_b.bed -wo

GIQL:

cursor = engine.execute("""
    SELECT
        a.*,
        b.*,
        (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp
    FROM features_a a, features_b b
    WHERE a.interval INTERSECTS b.interval
""")

-wao: Write overlap amount for ALL A features

Bedtools:

bedtools intersect -a file_a.bed -b file_b.bed -wao

GIQL:

cursor = engine.execute("""
    SELECT
        a.*,
        b.*,
        CASE
            WHEN b.chromosome IS NULL THEN 0
            ELSE LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)
        END AS overlap_bp
    FROM features_a a
    LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
""")

-loj: Left outer join

Bedtools:

bedtools intersect -a file_a.bed -b file_b.bed -loj

GIQL:

cursor = engine.execute("""
    SELECT a.*, b.*
    FROM features_a a
    LEFT JOIN features_b b ON a.interval INTERSECTS b.interval
""")

-s: Same strand overlaps only

Bedtools:

bedtools intersect -a file_a.bed -b file_b.bed -s

GIQL:

cursor = engine.execute("""
    SELECT a.*
    FROM features_a a, features_b b
    WHERE a.interval INTERSECTS b.interval
      AND a.strand = b.strand
""")

-S: Opposite strand overlaps only

Bedtools:

bedtools intersect -a file_a.bed -b file_b.bed -S

GIQL:

cursor = engine.execute("""
    SELECT a.*
    FROM features_a a, features_b b
    WHERE a.interval INTERSECTS b.interval
      AND a.strand != b.strand
      AND a.strand IN ('+', '-')
      AND b.strand IN ('+', '-')
""")

-f: Minimum overlap fraction of A

Bedtools:

bedtools intersect -a file_a.bed -b file_b.bed -f 0.5

GIQL:

cursor = engine.execute("""
    SELECT a.*
    FROM features_a a, features_b b
    WHERE a.interval INTERSECTS b.interval
      AND (
          LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)
      ) >= 0.5 * (a.end_pos - a.start_pos)
""")

-r: Reciprocal overlap

Bedtools:

bedtools intersect -a file_a.bed -b file_b.bed -f 0.5 -r

GIQL:

cursor = engine.execute("""
    WITH overlap_calcs AS (
        SELECT
            a.*,
            (LEAST(a.end_pos, b.end_pos) - GREATEST(a.start_pos, b.start_pos)) AS overlap_bp,
            (a.end_pos - a.start_pos) AS a_length,
            (b.end_pos - b.start_pos) AS b_length
        FROM features_a a, features_b b
        WHERE a.interval INTERSECTS b.interval
    )
    SELECT chromosome, start_pos, end_pos, name, score, strand
    FROM overlap_calcs
    WHERE overlap_bp >= 0.5 * a_length
      AND overlap_bp >= 0.5 * b_length
""")

bedtools closest

-k: Find k nearest features

Bedtools:

bedtools closest -a peaks.bed -b genes.bed -k 3

GIQL:

cursor = engine.execute("""
    SELECT
        peaks.name AS peak,
        nearest.name AS gene,
        nearest.distance
    FROM peaks
    CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=3) AS nearest
    ORDER BY peaks.name, nearest.distance
""")

-d: Report distance

Bedtools:

bedtools closest -a peaks.bed -b genes.bed -d

GIQL:

cursor = engine.execute("""
    SELECT
        a.name AS peak,
        b.name AS gene,
        DISTANCE(a.interval, b.interval) AS distance
    FROM peaks a
    CROSS JOIN genes b
    WHERE a.chromosome = b.chromosome
    ORDER BY a.name, distance
""")

Or using NEAREST for just the closest:

cursor = engine.execute("""
    SELECT
        peaks.name AS peak,
        nearest.name AS gene,
        nearest.distance
    FROM peaks
    CROSS JOIN LATERAL NEAREST(genes, reference=peaks.interval, k=1) AS nearest
""")

-s: Same strand only

Bedtools:

bedtools closest -a peaks.bed -b genes.bed -s -k 3

GIQL:

cursor = engine.execute("""
    SELECT
        peaks.name,
        nearest.name AS gene,
        nearest.distance
    FROM peaks
    CROSS JOIN LATERAL NEAREST(
        genes,
        reference=peaks.interval,
        k=3,
        stranded=true
    ) AS nearest
    ORDER BY peaks.name, nearest.distance
""")

bedtools cluster

Basic clustering

Bedtools:

bedtools cluster -i features.bed

GIQL:

cursor = engine.execute("""
    SELECT
        *,
        CLUSTER(interval) AS cluster_id
    FROM features
    ORDER BY chromosome, start_pos
""")

-d: Cluster with distance parameter

Bedtools:

bedtools cluster -i features.bed -d 1000

GIQL:

cursor = engine.execute("""
    SELECT
        *,
        CLUSTER(interval, 1000) AS cluster_id
    FROM features
    ORDER BY chromosome, start_pos
""")

-s: Strand-specific clustering

Bedtools:

bedtools cluster -i features.bed -s

GIQL:

cursor = engine.execute("""
    SELECT
        *,
        CLUSTER(interval, stranded=true) AS cluster_id
    FROM features
    ORDER BY chromosome, strand, start_pos
""")

bedtools merge

Basic merge

Bedtools:

bedtools merge -i features.bed

GIQL:

cursor = engine.execute("""
    SELECT MERGE(interval)
    FROM features
""")

-d: Merge with distance parameter

Bedtools:

bedtools merge -i features.bed -d 1000

GIQL:

cursor = engine.execute("""
    SELECT MERGE(interval, 1000)
    FROM features
""")

-s: Strand-specific merge

Bedtools:

bedtools merge -i features.bed -s

GIQL:

cursor = engine.execute("""
    SELECT MERGE(interval, stranded=true)
    FROM features
""")

-c -o count: Count merged features

Bedtools:

bedtools merge -i features.bed -c 1 -o count

GIQL:

cursor = engine.execute("""
    SELECT
        MERGE(interval),
        COUNT(*) AS feature_count
    FROM features
""")

-c -o mean: Average score

Bedtools:

bedtools merge -i features.bed -c 5 -o mean

GIQL:

cursor = engine.execute("""
    SELECT
        MERGE(interval),
        AVG(score) AS avg_score
    FROM features
""")

-c -o collapse: Collect names

Bedtools:

bedtools merge -i features.bed -c 4 -o collapse

GIQL:

cursor = engine.execute("""
    SELECT
        MERGE(interval),
        STRING_AGG(name, ',') AS feature_names
    FROM features
""")

Key Differences from Bedtools

  1. SQL-based syntax: GIQL uses SQL syntax, which may be more familiar to users with database experience and allows integration with other SQL features.

  2. Explicit joins: Instead of implicit A/B file relationships, GIQL uses explicit JOIN syntax, making the relationship between tables clearer.

  3. Flexible output: SQL’s SELECT clause gives you full control over which columns to return and how to format them.

  4. Built-in aggregation: SQL’s GROUP BY and aggregate functions (COUNT, AVG, SUM, etc.) are available directly, without needing separate post-processing.

  5. Database integration: GIQL queries run against database tables, enabling integration with other data and persistence of results.

  6. Multi-backend support: The same GIQL query can run on DuckDB, SQLite, or other supported backends without modification.