Skip to content

Commit e3b3cbb

Browse files
committed
feat: add article summary feature with OpenAI integration
1 parent ddc9c7b commit e3b3cbb

13 files changed

Lines changed: 1304 additions & 28 deletions

File tree

.golangci.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ linters:
5454
disabled: true
5555
- name: nested-structs
5656
disabled: true
57+
- name: max-public-structs
58+
arguments: [6]
5759
- name: use-slices-sort
5860
disabled: true
5961
gocritic:

README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,18 @@
2323
| openai-max-iter | OPENAI_MAX_ITER | `3` | max evaluation iterations per extraction |
2424
| dbg | DEBUG | `false` | debug mode |
2525

26+
#### OpenAI integration
27+
28+
| Command line | Environment | Default | Description |
29+
|-------------------------------|----------------------------|---------------|------------------------------------------------------------------|
30+
| openai.api-key | OPENAI_API_KEY | none | OpenAI API key for summary generation |
31+
| openai.model-type | OPENAI_MODEL_TYPE | `gpt-4o-mini` | OpenAI model name (e.g., gpt-4o, gpt-4o-mini) |
32+
| openai.disable-summaries | OPENAI_DISABLE_SUMMARIES | `false` | disable summary generation |
33+
| openai.summary-prompt | OPENAI_SUMMARY_PROMPT | built-in | custom prompt for summary generation |
34+
| openai.max-content-length | OPENAI_MAX_CONTENT_LENGTH | `10000` | maximum content length to send to OpenAI API (0 for no limit) |
35+
| openai.requests-per-minute | OPENAI_REQUESTS_PER_MINUTE | `10` | maximum OpenAI API requests per minute (0 for no limit) |
36+
| openai.cleanup-interval | OPENAI_CLEANUP_INTERVAL | `24h` | interval for cleaning up expired cached summaries |
37+
2638
### Cloudflare Browser Rendering (optional)
2739

2840
When both `--cf-account-id` and `--cf-api-token` are set, the service uses Cloudflare Browser Rendering API to fetch page content instead of direct HTTP. This renders JavaScript and handles bot-protection pages that return empty or "just a moment..." responses to standard HTTP requests.
@@ -44,9 +56,13 @@ When OpenAI is not configured, extraction works exactly as before — no GPT cal
4456
### API
4557

4658
GET /api/content/v1/parser?token=secret&url=http://aa.com/blah - extract content (emulate Readability API parse call)
59+
GET /api/content/v1/parser?token=secret&url=http://aa.com/blah&summary=true - extract content with AI-generated summary
4760
POST /api/extract {url: http://aa.com/blah} - extract content
61+
GET /api/metrics - summary generation metrics (cache hits, misses, response times)
4862
POST /api/content-parsed-wrong?url=http://aa.com/blah - force re-extraction with AI evaluation (requires basicAuth)
4963

64+
Summary generation requires a valid token and an OpenAI API key. Summaries are cached in MongoDB with a 1-month expiration. Expired summaries are cleaned up automatically on the configured interval.
65+
5066
## Development
5167

5268
### Running tests

datastore/mongo.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ func New(connectionURI, dbName string, delay time.Duration) (*MongoServer, error
3939

4040
// Stores contains all DAO instances
4141
type Stores struct {
42-
Rules RulesDAO
42+
Rules RulesDAO
43+
Summaries SummariesDAO
4344
}
4445

4546
// GetStores initialize collections and make indexes
@@ -50,8 +51,15 @@ func (m *MongoServer) GetStores() Stores {
5051
{Keys: bson.D{{Key: "domain", Value: 1}, {Key: "match_urls", Value: 1}}},
5152
}
5253

54+
sIndexes := []mongo.IndexModel{
55+
{Keys: bson.D{{Key: "created_at", Value: 1}}},
56+
{Keys: bson.D{{Key: "model", Value: 1}}},
57+
{Keys: bson.D{{Key: "expires_at", Value: 1}}}, // index for cleaning up expired summaries
58+
}
59+
5360
return Stores{
54-
Rules: RulesDAO{Collection: m.collection("rules", rIndexes)},
61+
Rules: RulesDAO{Collection: m.collection("rules", rIndexes)},
62+
Summaries: SummariesDAO{Collection: m.collection("summaries", sIndexes)},
5563
}
5664
}
5765

datastore/summaries.go

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
// Package datastore provides a MongoDB implementation of the store used to keep and access summaries
2+
package datastore
3+
4+
import (
5+
"context"
6+
"crypto/sha256"
7+
"encoding/hex"
8+
"fmt"
9+
"time"
10+
11+
log "github.com/go-pkgz/lgr"
12+
"go.mongodb.org/mongo-driver/v2/bson"
13+
"go.mongodb.org/mongo-driver/v2/mongo"
14+
"go.mongodb.org/mongo-driver/v2/mongo/options"
15+
)
16+
// Summary is one cached summary record, stored together with the content it was generated from.
type Summary struct {
	// ID is the sha256 hash of the content; it is used as the document _id.
	ID string `bson:"_id"`
	// Content is the original content that was summarized (could be truncated for storage efficiency).
	Content string `bson:"content"`
	// Summary is the generated summary text.
	Summary string `bson:"summary"`
	// Model is the OpenAI model used for summarisation.
	Model string `bson:"model"`
	// CreatedAt is filled in by Save when left zero.
	CreatedAt time.Time `bson:"created_at"`
	// UpdatedAt is overwritten by Save on every call.
	UpdatedAt time.Time `bson:"updated_at"`
	// ExpiresAt is when this summary expires; Save defaults it to one month ahead when left zero.
	ExpiresAt time.Time `bson:"expires_at"`
}
27+
28+
// SummariesDAO handles database operations for article summaries
29+
type SummariesDAO struct {
30+
Collection *mongo.Collection
31+
}
32+
33+
// Get returns summary by content hash
34+
func (s SummariesDAO) Get(ctx context.Context, content string) (Summary, bool) {
35+
contentHash := GenerateContentHash(content)
36+
res := s.Collection.FindOne(ctx, bson.M{"_id": contentHash})
37+
if res.Err() != nil {
38+
if res.Err() == mongo.ErrNoDocuments {
39+
return Summary{}, false
40+
}
41+
log.Printf("[WARN] can't get summary for hash %s: %v", contentHash, res.Err())
42+
return Summary{}, false
43+
}
44+
45+
summary := Summary{}
46+
if err := res.Decode(&summary); err != nil {
47+
log.Printf("[WARN] can't decode summary document for hash %s: %v", contentHash, err)
48+
return Summary{}, false
49+
}
50+
51+
return summary, true
52+
}
53+
54+
// Save creates or updates summary in the database
55+
func (s SummariesDAO) Save(ctx context.Context, summary Summary) error {
56+
if summary.ID == "" {
57+
summary.ID = GenerateContentHash(summary.Content)
58+
}
59+
60+
if summary.CreatedAt.IsZero() {
61+
summary.CreatedAt = time.Now()
62+
}
63+
summary.UpdatedAt = time.Now()
64+
65+
// set default expiration of 1 month if not specified
66+
if summary.ExpiresAt.IsZero() {
67+
summary.ExpiresAt = time.Now().AddDate(0, 1, 0)
68+
}
69+
70+
opts := options.UpdateOne().SetUpsert(true)
71+
_, err := s.Collection.UpdateOne(
72+
ctx,
73+
bson.M{"_id": summary.ID},
74+
bson.M{"$set": summary},
75+
opts,
76+
)
77+
if err != nil {
78+
return fmt.Errorf("failed to save summary: %w", err)
79+
}
80+
return nil
81+
}
82+
83+
// Delete removes summary from the database
84+
func (s SummariesDAO) Delete(ctx context.Context, contentHash string) error {
85+
_, err := s.Collection.DeleteOne(ctx, bson.M{"_id": contentHash})
86+
if err != nil {
87+
return fmt.Errorf("failed to delete summary: %w", err)
88+
}
89+
return nil
90+
}
91+
92+
// CleanupExpired removes all summaries that have expired
93+
func (s SummariesDAO) CleanupExpired(ctx context.Context) (int64, error) {
94+
now := time.Now()
95+
result, err := s.Collection.DeleteMany(
96+
ctx,
97+
bson.M{"expires_at": bson.M{"$lt": now}},
98+
)
99+
if err != nil {
100+
return 0, fmt.Errorf("failed to cleanup expired summaries: %w", err)
101+
}
102+
return result.DeletedCount, nil
103+
}
104+
// GenerateContentHash returns the SHA-256 digest of content as a lowercase hex string.
// The result is used as the ID of a summary document.
func GenerateContentHash(content string) string {
	digest := sha256.Sum256([]byte(content))
	encoded := make([]byte, hex.EncodedLen(len(digest)))
	hex.Encode(encoded, digest[:])
	return string(encoded)
}

datastore/summaries_test.go

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
package datastore
2+
3+
import (
4+
"context"
5+
"os"
6+
"testing"
7+
"time"
8+
9+
"github.com/stretchr/testify/assert"
10+
"github.com/stretchr/testify/require"
11+
"go.mongodb.org/mongo-driver/v2/bson"
12+
"go.mongodb.org/mongo-driver/v2/mongo"
13+
)
14+
15+
func TestSummariesDAO_SaveAndGet(t *testing.T) {
16+
if _, ok := os.LookupEnv("ENABLE_MONGO_TESTS"); !ok {
17+
t.Skip("ENABLE_MONGO_TESTS env variable is not set")
18+
}
19+
20+
mdb, err := New("mongodb://localhost:27017", "test_ureadability", 0)
21+
require.NoError(t, err)
22+
23+
// create a unique collection for this test to avoid conflicts
24+
collection := mdb.client.Database(mdb.dbName).Collection("summaries_test")
25+
defer func() {
26+
_ = collection.Drop(context.Background())
27+
}()
28+
29+
// create an index on the expiresAt field
30+
_, err = collection.Indexes().CreateOne(context.Background(),
31+
mongo.IndexModel{
32+
Keys: bson.D{{Key: "expires_at", Value: 1}},
33+
})
34+
require.NoError(t, err)
35+
36+
dao := SummariesDAO{Collection: collection}
37+
38+
content := "This is a test article content. It should generate a unique hash."
39+
summary := Summary{
40+
Content: content,
41+
Summary: "This is a test summary of the article.",
42+
Model: "gpt-4o-mini",
43+
CreatedAt: time.Now(),
44+
}
45+
46+
// test saving a summary
47+
err = dao.Save(context.Background(), summary)
48+
require.NoError(t, err)
49+
50+
// test getting the summary
51+
foundSummary, found := dao.Get(context.Background(), content)
52+
assert.True(t, found)
53+
assert.Equal(t, summary.Summary, foundSummary.Summary)
54+
assert.Equal(t, summary.Model, foundSummary.Model)
55+
assert.NotEmpty(t, foundSummary.ID)
56+
57+
// test getting a non-existent summary
58+
_, found = dao.Get(context.Background(), "non-existent content")
59+
assert.False(t, found)
60+
61+
// test updating an existing summary
62+
updatedSummary := Summary{
63+
ID: foundSummary.ID,
64+
Content: content,
65+
Summary: "This is an updated summary.",
66+
Model: "gpt-4o-mini",
67+
CreatedAt: foundSummary.CreatedAt,
68+
}
69+
70+
err = dao.Save(context.Background(), updatedSummary)
71+
require.NoError(t, err)
72+
73+
foundSummary, found = dao.Get(context.Background(), content)
74+
assert.True(t, found)
75+
assert.Equal(t, "This is an updated summary.", foundSummary.Summary)
76+
assert.Equal(t, updatedSummary.CreatedAt, foundSummary.CreatedAt)
77+
assert.NotEqual(t, updatedSummary.UpdatedAt, foundSummary.UpdatedAt) // UpdatedAt should be set by the DAO
78+
79+
// test deleting a summary
80+
err = dao.Delete(context.Background(), foundSummary.ID)
81+
require.NoError(t, err)
82+
83+
_, found = dao.Get(context.Background(), content)
84+
assert.False(t, found)
85+
}
86+
87+
func TestGenerateContentHash(t *testing.T) {
88+
content1 := "This is a test content."
89+
content2 := "This is a different test content."
90+
91+
hash1 := GenerateContentHash(content1)
92+
hash2 := GenerateContentHash(content2)
93+
94+
assert.NotEqual(t, hash1, hash2)
95+
assert.Equal(t, hash1, GenerateContentHash(content1)) // same content should produce same hash
96+
assert.Equal(t, 64, len(hash1)) // SHA-256 produces 64 character hex string
97+
}
98+
99+
func TestSummariesDAO_CleanupExpired(t *testing.T) {
100+
if _, ok := os.LookupEnv("ENABLE_MONGO_TESTS"); !ok {
101+
t.Skip("ENABLE_MONGO_TESTS env variable is not set")
102+
}
103+
104+
mdb, err := New("mongodb://localhost:27017", "test_ureadability", 0)
105+
require.NoError(t, err)
106+
107+
// create a unique collection for this test to avoid conflicts
108+
collection := mdb.client.Database(mdb.dbName).Collection("summaries_expired_test")
109+
defer func() {
110+
_ = collection.Drop(context.Background())
111+
}()
112+
113+
// create an index on the expiresAt field
114+
_, err = collection.Indexes().CreateOne(context.Background(),
115+
mongo.IndexModel{
116+
Keys: bson.D{{Key: "expires_at", Value: 1}},
117+
})
118+
require.NoError(t, err)
119+
120+
dao := SummariesDAO{Collection: collection}
121+
ctx := context.Background()
122+
123+
// add expired summary
124+
expiredSummary := Summary{
125+
Content: "This is an expired summary",
126+
Summary: "Expired content",
127+
Model: "gpt-4o-mini",
128+
CreatedAt: time.Now().Add(-48 * time.Hour),
129+
UpdatedAt: time.Now().Add(-48 * time.Hour),
130+
ExpiresAt: time.Now().Add(-24 * time.Hour), // expired 24 hours ago
131+
}
132+
err = dao.Save(ctx, expiredSummary)
133+
require.NoError(t, err)
134+
135+
// add valid summary
136+
validSummary := Summary{
137+
Content: "This is a valid summary",
138+
Summary: "Valid content",
139+
Model: "gpt-4o-mini",
140+
CreatedAt: time.Now(),
141+
UpdatedAt: time.Now(),
142+
ExpiresAt: time.Now().Add(24 * time.Hour), // expires in 24 hours
143+
}
144+
err = dao.Save(ctx, validSummary)
145+
require.NoError(t, err)
146+
147+
// verify both summaries exist
148+
_, foundExpired := dao.Get(ctx, expiredSummary.Content)
149+
assert.True(t, foundExpired, "Expected to find expired summary before cleanup")
150+
151+
_, foundValid := dao.Get(ctx, validSummary.Content)
152+
assert.True(t, foundValid, "Expected to find valid summary before cleanup")
153+
154+
// run cleanup
155+
count, err := dao.CleanupExpired(ctx)
156+
require.NoError(t, err)
157+
assert.Equal(t, int64(1), count, "Expected to clean up exactly one record")
158+
159+
// verify expired summary is gone but valid remains
160+
_, foundExpired = dao.Get(ctx, expiredSummary.Content)
161+
assert.False(t, foundExpired, "Expected expired summary to be deleted")
162+
163+
_, foundValid = dao.Get(ctx, validSummary.Content)
164+
assert.True(t, foundValid, "Expected valid summary to still exist")
165+
}

0 commit comments

Comments
 (0)