Documentation
¶
Overview ¶
Package processor provides unified data processing capabilities
Index ¶
- func ValidatePaginationConfig(config PaginationConfig) error
- func ValidateSortConfig(config SortConfig) error
- type ArticleSorter
- func (as *ArticleSorter) SetSortConfig(config SortConfig)
- func (as *ArticleSorter) SetUserPreferences(prefs *UserPreferences)
- func (as *ArticleSorter) SortAndPaginate(articles []*models.Article, paginationConfig PaginationConfig) (*PaginationResult, error)
- func (as *ArticleSorter) SortArticles(articles []*models.Article) []*models.Article
- type Config
- type Converter
- func (c *Converter) BatchConvertArticles(collectorArticles []collector.Article) ([]*models.Article, []error)
- func (c *Converter) ConvertToArticle(collectorArticle collector.Article) (*models.Article, error)
- func (c *Converter) ConvertToRepository(name, fullName, url string, metadata map[string]string) (*models.Repository, error)
- func (c *Converter) GenerateHash(title, url string) string
- func (c *Converter) GetConfig() ConverterConfig
- func (c *Converter) UpdateConfig(config ConverterConfig)
- type ConverterConfig
- type DocumentFrequency
- type KeywordMatcher
- type PaginationConfig
- type PaginationResult
- type ProcessOptions
- type Processor
- func (p *Processor) CalculateFrontendRelevance(article models.Article, query string) float64
- func (p *Processor) GetStats() ProcessorStats
- func (p *Processor) ProcessArticles(ctx context.Context, articles []models.Article, options ProcessOptions) ([]models.Article, error)
- func (p *Processor) ProcessRepositories(ctx context.Context, repos []models.Repository, options ProcessOptions) ([]models.Repository, error)
- type ProcessorStats
- type RelevanceScorer
- func (rs *RelevanceScorer) AddToCorpus(article *models.Article)
- func (rs *RelevanceScorer) ClearCache()
- func (rs *RelevanceScorer) GetKeywords() []string
- func (rs *RelevanceScorer) GetTopTerms(article *models.Article, n int) []TermFrequency
- func (rs *RelevanceScorer) GetWeightConfig() WeightConfig
- func (rs *RelevanceScorer) ScoreRelevance(article *models.Article) float64
- func (rs *RelevanceScorer) SetCorpus(articles []*models.Article)
- func (rs *RelevanceScorer) SetWeightConfig(config WeightConfig)
- func (rs *RelevanceScorer) UpdateKeywords(keywords []string)
- type RepositoryPaginationResult
- type RepositorySorter
- func (rs *RepositorySorter) SetSortConfig(config SortConfig)
- func (rs *RepositorySorter) SetUserPreferences(prefs *UserPreferences)
- func (rs *RepositorySorter) SortAndPaginate(repos []*models.Repository, paginationConfig PaginationConfig) (*RepositoryPaginationResult, error)
- func (rs *RepositorySorter) SortRepositories(repos []*models.Repository) []*models.Repository
- type SentenceScore
- type SimpleStemmer
- type SortBy
- type SortConfig
- type SortOrder
- type Summarizer
- type TFIDFScore
- type TermFrequency
- type UserPreferences
- type WeightConfig
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ValidatePaginationConfig ¶
func ValidatePaginationConfig(config PaginationConfig) error
ValidatePaginationConfig validates pagination configuration
func ValidateSortConfig ¶
func ValidateSortConfig(config SortConfig) error
ValidateSortConfig validates a sort configuration
Types ¶
type ArticleSorter ¶
type ArticleSorter struct {
// contains filtered or unexported fields
}
ArticleSorter handles multi-dimensional sorting and pagination of articles
func NewArticleSorter ¶
func NewArticleSorter(relevanceScorer *RelevanceScorer) *ArticleSorter
NewArticleSorter creates a new ArticleSorter instance
func (*ArticleSorter) SetSortConfig ¶
func (as *ArticleSorter) SetSortConfig(config SortConfig)
SetSortConfig updates the sorting configuration
func (*ArticleSorter) SetUserPreferences ¶
func (as *ArticleSorter) SetUserPreferences(prefs *UserPreferences)
SetUserPreferences sets user-specific preferences for personalized ranking
func (*ArticleSorter) SortAndPaginate ¶
func (as *ArticleSorter) SortAndPaginate(articles []*models.Article, paginationConfig PaginationConfig) (*PaginationResult, error)
SortAndPaginate sorts articles and applies pagination
func (*ArticleSorter) SortArticles ¶
func (as *ArticleSorter) SortArticles(articles []*models.Article) []*models.Article
SortArticles sorts articles according to the configured criteria
type Config ¶
type Config struct {
EnableSummarization bool `json:"enableSummarization"`
EnableSorting bool `json:"enableSorting"`
MaxSummaryLength int `json:"maxSummaryLength"`
ProcessingTimeout time.Duration `json:"processingTimeout"`
MaxConcurrency int `json:"maxConcurrency"`
}
Config holds processor configuration
func DefaultConfig ¶
func DefaultConfig() *Config
DefaultConfig returns default processor configuration
type Converter ¶
type Converter struct {
// contains filtered or unexported fields
}
Converter handles conversion from raw collector data to unified Article/Repository models. It provides thread-safe operations for concurrent processing and comprehensive data normalization.
func NewConverter ¶
func NewConverter(config ConverterConfig) *Converter
NewConverter creates a new converter with the given configuration
func NewDefaultConverter ¶
func NewDefaultConverter() *Converter
NewDefaultConverter creates a new converter with default configuration
func (*Converter) BatchConvertArticles ¶
func (c *Converter) BatchConvertArticles(collectorArticles []collector.Article) ([]*models.Article, []error)
BatchConvertArticles converts multiple collector articles concurrently
func (*Converter) ConvertToArticle ¶
func (c *Converter) ConvertToArticle(collectorArticle collector.Article) (*models.Article, error)
ConvertToArticle converts a collector.Article to models.Article with full data normalization
func (*Converter) ConvertToRepository ¶
func (c *Converter) ConvertToRepository(name, fullName, url string, metadata map[string]string) (*models.Repository, error)
ConvertToRepository converts API data to models.Repository for repository-type content
func (*Converter) GenerateHash ¶
func (c *Converter) GenerateHash(title, url string) string
GenerateHash creates a hash for deduplication purposes
func (*Converter) GetConfig ¶
func (c *Converter) GetConfig() ConverterConfig
GetConfig returns a copy of the current configuration
func (*Converter) UpdateConfig ¶
func (c *Converter) UpdateConfig(config ConverterConfig)
UpdateConfig updates the converter configuration in a thread-safe manner
type ConverterConfig ¶
type ConverterConfig struct {
// Maximum length for summary text (default: 1000)
MaxSummaryLength int
// Maximum length for title text (default: 500)
MaxTitleLength int
// Maximum length for content text (default: 50000)
MaxContentLength int
// Default quality score for articles without explicit quality indicators
DefaultQuality float64
// Default relevance score for articles without relevance calculation
DefaultRelevance float64
// Enable aggressive HTML cleaning (removes more tags and attributes)
AggressiveHTMLCleaning bool
// Normalize URLs to canonical form (removes tracking parameters, etc.)
NormalizeURLs bool
// Time zone for date normalization (default: UTC)
TimeZone *time.Location
}
ConverterConfig contains configuration options for data conversion
func DefaultConverterConfig ¶
func DefaultConverterConfig() ConverterConfig
DefaultConverterConfig returns a configuration with sensible defaults
type DocumentFrequency ¶
DocumentFrequency represents how many documents contain a specific term
type KeywordMatcher ¶
type KeywordMatcher struct {
Keywords []string `json:"keywords"`
WeightConfig WeightConfig `json:"weightConfig"`
// contains filtered or unexported fields
}
KeywordMatcher handles keyword matching with weighted scoring
func (*KeywordMatcher) ScoreKeywordMatch ¶
func (km *KeywordMatcher) ScoreKeywordMatch(article *models.Article) float64
ScoreKeywordMatch calculates keyword matching score with weights
type PaginationConfig ¶
type PaginationConfig struct {
Page int `json:"page"` // Current page (1-based)
PageSize int `json:"pageSize"` // Items per page
}
PaginationConfig defines pagination settings
func GetDefaultPaginationConfig ¶
func GetDefaultPaginationConfig() PaginationConfig
GetDefaultPaginationConfig returns default pagination configuration
type PaginationResult ¶
type PaginationResult struct {
Items []*models.Article `json:"items"`
CurrentPage int `json:"currentPage"`
PageSize int `json:"pageSize"`
TotalItems int `json:"totalItems"`
TotalPages int `json:"totalPages"`
HasNext bool `json:"hasNext"`
HasPrev bool `json:"hasPrev"`
}
PaginationResult contains paginated results with metadata
type ProcessOptions ¶
type ProcessOptions struct {
Query string `json:"query"`
SortBy string `json:"sortBy"`
SortOrder string `json:"sortOrder"`
Limit int `json:"limit"`
}
ProcessOptions defines options for processing
type Processor ¶
type Processor struct {
// contains filtered or unexported fields
}
Processor provides unified data processing functionality
func NewProcessor ¶
NewProcessor creates a new processor instance
func (*Processor) CalculateFrontendRelevance ¶
func (p *Processor) CalculateFrontendRelevance(article models.Article, query string) float64
CalculateFrontendRelevance calculates how relevant an article is to frontend development
func (*Processor) GetStats ¶
func (p *Processor) GetStats() ProcessorStats
GetStats returns processor statistics
func (*Processor) ProcessArticles ¶
func (p *Processor) ProcessArticles(ctx context.Context, articles []models.Article, options ProcessOptions) ([]models.Article, error)
ProcessArticles processes a slice of articles with various enhancements
func (*Processor) ProcessRepositories ¶
func (p *Processor) ProcessRepositories(ctx context.Context, repos []models.Repository, options ProcessOptions) ([]models.Repository, error)
ProcessRepositories processes a slice of repositories
type ProcessorStats ¶
type ProcessorStats struct {
ProcessedArticles int `json:"processedArticles"`
ProcessedRepositories int `json:"processedRepositories"`
AverageProcessingTime time.Duration `json:"averageProcessingTime"`
CacheHitRate float64 `json:"cacheHitRate"`
}
ProcessorStats holds processor performance statistics
type RelevanceScorer ¶
type RelevanceScorer struct {
// contains filtered or unexported fields
}
RelevanceScorer handles content relevance scoring using TF-IDF and keyword matching
func NewRelevanceScorer ¶
func NewRelevanceScorer(keywords []string) *RelevanceScorer
NewRelevanceScorer creates a new instance of RelevanceScorer
func (*RelevanceScorer) AddToCorpus ¶
func (rs *RelevanceScorer) AddToCorpus(article *models.Article)
AddToCorpus adds an article to the corpus
func (*RelevanceScorer) ClearCache ¶
func (rs *RelevanceScorer) ClearCache()
ClearCache clears all internal caches
func (*RelevanceScorer) GetKeywords ¶
func (rs *RelevanceScorer) GetKeywords() []string
GetKeywords returns the current keywords
func (*RelevanceScorer) GetTopTerms ¶
func (rs *RelevanceScorer) GetTopTerms(article *models.Article, n int) []TermFrequency
GetTopTerms returns the top N terms for an article based on TF-IDF
func (*RelevanceScorer) GetWeightConfig ¶
func (rs *RelevanceScorer) GetWeightConfig() WeightConfig
GetWeightConfig returns the current weight configuration
func (*RelevanceScorer) ScoreRelevance ¶
func (rs *RelevanceScorer) ScoreRelevance(article *models.Article) float64
ScoreRelevance calculates the relevance score for an article
func (*RelevanceScorer) SetCorpus ¶
func (rs *RelevanceScorer) SetCorpus(articles []*models.Article)
SetCorpus sets the corpus for IDF calculation
func (*RelevanceScorer) SetWeightConfig ¶
func (rs *RelevanceScorer) SetWeightConfig(config WeightConfig)
SetWeightConfig updates the weight configuration
func (*RelevanceScorer) UpdateKeywords ¶
func (rs *RelevanceScorer) UpdateKeywords(keywords []string)
UpdateKeywords updates the keywords used for scoring
type RepositoryPaginationResult ¶
type RepositoryPaginationResult struct {
Items []*models.Repository `json:"items"`
CurrentPage int `json:"currentPage"`
PageSize int `json:"pageSize"`
TotalItems int `json:"totalItems"`
TotalPages int `json:"totalPages"`
HasNext bool `json:"hasNext"`
HasPrev bool `json:"hasPrev"`
}
RepositoryPaginationResult contains paginated repository results
type RepositorySorter ¶
type RepositorySorter struct {
// contains filtered or unexported fields
}
RepositorySorter handles sorting of repositories
func NewRepositorySorter ¶
func NewRepositorySorter() *RepositorySorter
NewRepositorySorter creates a new RepositorySorter instance
func (*RepositorySorter) SetSortConfig ¶
func (rs *RepositorySorter) SetSortConfig(config SortConfig)
SetSortConfig updates the sorting configuration for repositories
func (*RepositorySorter) SetUserPreferences ¶
func (rs *RepositorySorter) SetUserPreferences(prefs *UserPreferences)
SetUserPreferences sets user preferences for repository sorting
func (*RepositorySorter) SortAndPaginate ¶
func (rs *RepositorySorter) SortAndPaginate(repos []*models.Repository, paginationConfig PaginationConfig) (*RepositoryPaginationResult, error)
SortAndPaginate sorts repositories and applies pagination
func (*RepositorySorter) SortRepositories ¶
func (rs *RepositorySorter) SortRepositories(repos []*models.Repository) []*models.Repository
SortRepositories sorts repositories according to the configured criteria
type SentenceScore ¶
SentenceScore represents a sentence with its calculated importance score
type SimpleStemmer ¶
type SimpleStemmer struct {
// contains filtered or unexported fields
}
SimpleStemmer provides basic word stemming functionality
func NewSimpleStemmer ¶
func NewSimpleStemmer() *SimpleStemmer
NewSimpleStemmer creates a new stemmer instance
func (*SimpleStemmer) Stem ¶
func (ss *SimpleStemmer) Stem(word string) string
Stem applies basic stemming to a word
type SortBy ¶
type SortBy string
SortBy defines the primary sorting criteria
const (
SortByRelevance SortBy = "relevance" // Sort by relevance score
SortByTime SortBy = "time" // Sort by publication time
SortByPopularity SortBy = "popularity" // Sort by quality/popularity metrics
SortByTrend SortBy = "trend" // Sort by trending score
SortByComposite SortBy = "composite" // Weighted combination of multiple factors
)
type SortConfig ¶
type SortConfig struct {
Primary SortBy `json:"primary"` // Primary sorting criterion
Secondary SortBy `json:"secondary"` // Secondary sorting criterion (for tie-breaking)
Order SortOrder `json:"order"` // Sort order
// Weights for composite sorting
RelevanceWeight float64 `json:"relevanceWeight"` // Weight for relevance (default: 0.4)
TimeWeight float64 `json:"timeWeight"` // Weight for recency (default: 0.3)
PopularityWeight float64 `json:"popularityWeight"` // Weight for popularity (default: 0.2)
TrendWeight float64 `json:"trendWeight"` // Weight for trending (default: 0.1)
}
SortConfig defines the sorting configuration
func GetDefaultSortConfig ¶
func GetDefaultSortConfig() SortConfig
GetDefaultSortConfig returns default sorting configuration
type Summarizer ¶
type Summarizer struct {
// Configuration for summary generation
MinSummaryLength int // Minimum summary length in characters
MaxSummaryLength int // Maximum summary length in characters
SentenceCount int // Target number of sentences in summary
PositionWeight float64 // Weight for sentence position (earlier sentences score higher)
LengthWeight float64 // Weight for sentence length
KeywordWeight float64 // Weight for keyword density
}
Summarizer handles intelligent content summarization and processing
func NewSummarizer ¶
func NewSummarizer() *Summarizer
NewSummarizer creates a new Summarizer with default configuration
func (*Summarizer) AssessQuality ¶
func (s *Summarizer) AssessQuality(article *models.Article) float64
AssessQuality evaluates content quality based on multiple factors
func (*Summarizer) CleanContent ¶
func (s *Summarizer) CleanContent(article *models.Article) error
CleanContent performs content cleaning and preprocessing
func (*Summarizer) GenerateSummary ¶
func (s *Summarizer) GenerateSummary(text string) (string, error)
GenerateSummary creates an intelligent summary of the given text. It uses sentence extraction based on position, length, and keyword density.
func (*Summarizer) ProcessArticle ¶
func (s *Summarizer) ProcessArticle(article *models.Article) error
ProcessArticle performs complete content processing on an article. This includes cleaning, summarization, and quality assessment.
type TFIDFScore ¶
type TFIDFScore struct {
Term string `json:"term"`
TF float64 `json:"tf"` // Term Frequency
IDF float64 `json:"idf"` // Inverse Document Frequency
TFIDF float64 `json:"tfidf"` // TF-IDF Score
Weight float64 `json:"weight"` // Additional weight based on position/importance
}
TFIDFScore represents the TF-IDF score for a term in a document
type TermFrequency ¶
type TermFrequency struct {
Term string `json:"term"`
Count int `json:"count"`
Frequency float64 `json:"frequency"`
}
TermFrequency represents the frequency of a term in a document
type UserPreferences ¶
type UserPreferences struct {
FavoriteTopics []string `json:"favoriteTopics"` // Preferred topic keywords
PreferredSources []string `json:"preferredSources"` // Preferred news sources
ReadingHistory []string `json:"readingHistory"` // Article IDs user has read
TopicWeights map[string]float64 `json:"topicWeights"` // Custom weights for topics
RecencyPreference float64 `json:"recencyPreference"` // How much user prefers recent articles (0-1)
LanguagePrefs []string `json:"languagePrefs"` // Preferred programming languages for repos
}
UserPreferences defines user-specific ranking preferences
type WeightConfig ¶
type WeightConfig struct {
TitleWeight float64 `json:"titleWeight"` // Weight for title matches (default: 3.0)
SummaryWeight float64 `json:"summaryWeight"` // Weight for summary matches (default: 2.0)
ContentWeight float64 `json:"contentWeight"` // Weight for content matches (default: 1.0)
TagWeight float64 `json:"tagWeight"` // Weight for tag matches (default: 4.0)
ExactMatch float64 `json:"exactMatch"` // Bonus for exact keyword match (default: 1.5)
PartialMatch float64 `json:"partialMatch"` // Score for partial match (default: 0.8)
}
WeightConfig defines scoring weights for different text sections