## Core Modules in Detail

A detailed description of the capabilities provided by each core module.
<!-- ### 1. 数据访问层 (Data Access Layer)
nebula-data-access
核心抽象层,提供统一的数据访问接口
// 统一Repository接口
@Repository
public class UserRepository extends AbstractRepository<User, Long> {
@Override
protected Class<User> getEntityClass() {
return User.class;
}
}
// 查询构建器
QueryBuilder query = DefaultQueryBuilder.create()
.eq("status", "ACTIVE")
.like("name", "张%")
.gt("createTime", lastWeek)
.build();
``` -->
#### nebula-data-persistence (MyBatis-Plus Integration)
**Relational database persistence support**
```yaml
nebula:
  data:
    persistence:
      # Read/write splitting
      read-write-separation:
        enabled: true
        master:
          url: jdbc:mysql://master:3306/nebula
          username: root
          password: password
        slave:
          url: jdbc:mysql://slave:3306/nebula
          username: reader
          password: password
      # Database/table sharding
      sharding:
        enabled: true
        tables:
          user:
            actual-data-nodes: ds_${0..1}.user_${0..3}
            table-strategy:
              inline:
                sharding-column: id
                algorithm-expression: user_${id % 4}
```
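With this module in place, data access follows standard MyBatis-Plus conventions. A minimal usage sketch; the `User` entity and `UserMapper` below are illustrative, not part of the framework:

```java
import com.baomidou.mybatisplus.annotation.TableName;
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import org.apache.ibatis.annotations.Mapper;
import org.springframework.stereotype.Service;

import java.util.List;

@TableName("user")
class User {
    private Long id;
    private String name;
    private String status;
    // getters/setters omitted
}

// Standard MyBatis-Plus mapper; CRUD methods are inherited from BaseMapper
@Mapper
interface UserMapper extends BaseMapper<User> {
}

@Service
class UserQueryService {

    private final UserMapper userMapper;

    UserQueryService(UserMapper userMapper) {
        this.userMapper = userMapper;
    }

    // Reads are routed to the slave when read-write separation is enabled
    List<User> findActiveUsers() {
        return userMapper.selectList(
                new QueryWrapper<User>().eq("status", "ACTIVE"));
    }
}
```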
<!-- #### nebula-data-mongodb (NoSQL支持) MongoDB集成支持
@Service
public class DocumentService {
@Autowired
private MongoRepository mongoRepository;
public void saveDocument(Document doc) {
mongoRepository.save(doc);
}
public List<Document> findByCategory(String category) {
return mongoRepository.findByCategory(category);
}
}
``` -->
#### nebula-data-cache (Caching Support)
**Multi-level cache management**
```yaml
nebula:
  data:
    cache:
      # Multi-level cache configuration
      multi-level:
        enabled: true
        local:
          type: caffeine
          max-size: 10000
          expire-after-write: 5m
        remote:
          type: redis
          expire-after-write: 1h
          key-prefix: "nebula:"
```
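Assuming the multi-level cache backs Spring's standard cache abstraction (an assumption; check the module docs), caching can then be applied declaratively. A minimal sketch; `"user-profiles"` is an illustrative cache name:

```java
import org.springframework.cache.annotation.CacheEvict;
import org.springframework.cache.annotation.Cacheable;
import org.springframework.stereotype.Service;

@Service
public class UserProfileCache {

    // First lookup hits the Caffeine layer, then Redis, then this method;
    // on a miss, the result is written back to both levels
    @Cacheable(cacheNames = "user-profiles", key = "#userId")
    public String getProfile(Long userId) {
        return loadProfileFromDatabase(userId);
    }

    // Removes the entry from both cache levels after an update
    @CacheEvict(cacheNames = "user-profiles", key = "#userId")
    public void evictProfile(Long userId) {
    }

    private String loadProfileFromDatabase(Long userId) {
        return "profile-" + userId; // placeholder for the real lookup
    }
}
```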
### 2. Messaging Layer

#### nebula-messaging-core & nebula-messaging-rabbitmq
**Message queue abstraction and RabbitMQ implementation**
```java
@Service
@Slf4j
public class NotificationService {

    @Autowired
    private MessageManager messageManager;

    public void sendNotification(String userId, String message) {
        Message<String> msg = Message.<String>builder()
                .topic("user-notifications")
                .payload(message)
                .build();
        messageManager.getProducer().send("user-notifications", msg);
    }

    @MessageHandler("user-notifications")
    public void handleNotification(Message<String> message) {
        // Handle the notification message
        log.info("Handling notification: {}", message.getPayload());
    }
}
```
### 3. Service Discovery & RPC Layer

#### nebula-discovery-core & nebula-discovery-nacos
**Service registration and discovery**
```yaml
nebula:
  discovery:
    nacos:
      enabled: true
      server-addr: localhost:8848
      namespace: nebula-dev
      group: DEFAULT_GROUP
```
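Registered services can then be looked up programmatically. A sketch assuming the Nacos integration exposes the standard Spring Cloud `DiscoveryClient` (an assumption, not confirmed by this document):

```java
import org.springframework.cloud.client.ServiceInstance;
import org.springframework.cloud.client.discovery.DiscoveryClient;
import org.springframework.stereotype.Service;

import java.util.List;

@Service
public class ServiceLookup {

    private final DiscoveryClient discoveryClient;

    public ServiceLookup(DiscoveryClient discoveryClient) {
        this.discoveryClient = discoveryClient;
    }

    // Lists the instances of "user-service" currently registered in Nacos
    public List<ServiceInstance> userServiceInstances() {
        return discoveryClient.getInstances("user-service");
    }
}
```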
#### nebula-rpc-core & nebula-rpc-http
**Remote invocation support**
```java
@RpcClient("user-service")
public interface UserRpcClient {

    @RpcCall("/api/users/{id}")
    User getUserById(@PathParam("id") Long id);

    @RpcCall(value = "/api/users", method = "POST")
    User createUser(@RequestBody CreateUserRequest request);
}
```
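Calling the client is then ordinary dependency injection. A minimal sketch, assuming `@RpcClient` interfaces are registered as Spring beans:

```java
@Service
public class UserFacade {

    @Autowired
    private UserRpcClient userRpcClient;

    // The "user-service" name is resolved through service discovery at call time
    public User loadUser(Long id) {
        return userRpcClient.getUserById(id);
    }
}
```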
### 4. Storage Layer

#### nebula-storage-core, nebula-storage-minio, nebula-storage-aliyun-oss
**Unified object storage interface**
```java
@Service
public class FileService {

    @Autowired
    private StorageService storageService;

    public String uploadFile(MultipartFile file) throws IOException {
        ObjectMetadata metadata = ObjectMetadata.builder()
                .contentType(file.getContentType())
                .contentLength(file.getSize())
                .build();
        StorageResult result = storageService.upload(
                "documents/" + file.getOriginalFilename(),
                file.getInputStream(),
                metadata
        );
        return result.getUrl();
    }
}
```
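Exposing the upload through a standard Spring MVC endpoint might look like the following sketch; the route is illustrative:

```java
@RestController
@RequestMapping("/api/files")
public class FileController {

    @Autowired
    private FileService fileService;

    // Accepts a multipart upload and returns the stored object's URL
    @PostMapping
    public String upload(@RequestParam("file") MultipartFile file) throws IOException {
        return fileService.uploadFile(file);
    }
}
```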
Configuration example:
```yaml
nebula:
  storage:
    # MinIO configuration
    minio:
      enabled: true
      endpoint: http://localhost:9000
      access-key: minioadmin
      secret-key: minioadmin
      default-bucket: nebula-files
    # Aliyun OSS configuration
    aliyun:
      oss:
        enabled: false
        endpoint: https://oss-cn-hangzhou.aliyuncs.com
        access-key-id: ${ALIYUN_ACCESS_KEY}
        access-key-secret: ${ALIYUN_SECRET_KEY}
        default-bucket: nebula-oss
```
### 5. Search Layer

#### nebula-search-core & nebula-search-elasticsearch
**Full-text search support**
```java
@Service
public class ProductSearchService {

    @Autowired
    private SearchService searchService;

    public void indexProduct(Product product) {
        SearchDocument document = SearchDocument.builder()
                .id(product.getId().toString())
                .content(product.getName() + " " + product.getDescription())
                .metadata(Map.of(
                        "category", product.getCategory(),
                        "price", product.getPrice(),
                        "brand", product.getBrand()
                ))
                .build();
        searchService.index("products", document);
    }

    public SearchResult searchProducts(String query, String category) {
        SearchQuery searchQuery = SearchQuery.builder()
                .query(query)
                .filter("category", category)
                .size(20)
                .build();
        return searchService.search("products", searchQuery);
    }
}
```
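A thin HTTP endpoint over the search service, as a sketch; the route and parameter names are illustrative:

```java
@RestController
@RequestMapping("/api/search")
public class ProductSearchController {

    @Autowired
    private ProductSearchService productSearchService;

    // Full-text query with an optional category filter
    @GetMapping("/products")
    public SearchResult search(@RequestParam("q") String query,
                               @RequestParam(value = "category", required = false) String category) {
        return productSearchService.searchProducts(query, category);
    }
}
```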
### 6. Integration Layer

#### nebula-integration-payment
**Payment integration abstraction**
```java
@Service
public class OrderService {

    @Autowired
    private PaymentService paymentService;

    public PaymentResponse createPayment(Order order) {
        PaymentRequest request = PaymentRequest.builder()
                .orderNo(order.getOrderNo())
                .amount(order.getTotalAmount())
                .currency("CNY")
                .subject(order.getTitle())
                .buyerInfo(BuyerInfo.builder()
                        .buyerId(order.getUserId().toString())
                        .buyerName(order.getUserName())
                        .build())
                .build();
        return paymentService.createPayment(request);
    }
}
```
Configuration example:
```yaml
nebula:
  payment:
    # Mock payment (development/testing)
    mock:
      enabled: true
      auto-success-delay: 60
    # Alipay configuration
    alipay:
      enabled: false
      app-id: ${ALIPAY_APP_ID}
      private-key: ${ALIPAY_PRIVATE_KEY}
      public-key: ${ALIPAY_PUBLIC_KEY}
    # WeChat Pay configuration
    wechat-pay:
      enabled: false
      app-id: ${WECHAT_APP_ID}
      mch-id: ${WECHAT_MCH_ID}
      mch-key: ${WECHAT_MCH_KEY}
```
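Wiring the payment call into an order endpoint could look like the following sketch; `OrderRepository` and its `findByOrderNo` finder are hypothetical stand-ins for however orders are actually loaded:

```java
@RestController
@RequestMapping("/api/orders")
public class PaymentController {

    @Autowired
    private OrderService orderService;
    @Autowired
    private OrderRepository orderRepository; // hypothetical order lookup

    // Creates a payment for an existing order and returns the provider response
    @PostMapping("/{orderNo}/payments")
    public PaymentResponse pay(@PathVariable String orderNo) {
        Order order = orderRepository.findByOrderNo(orderNo); // hypothetical finder
        return orderService.createPayment(order);
    }
}
```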
### 7. AI Layer

#### nebula-ai-core & nebula-ai-spring
**AI capability integration (built on Spring AI)**
```java
@Service
public class AIService {

    @Autowired
    private ChatService chatService;
    @Autowired
    private EmbeddingService embeddingService;
    @Autowired
    private VectorStoreService vectorStoreService;

    // Conversational chat
    public String chat(String message) {
        ChatResponse response = chatService.chat(message);
        return response.getContent();
    }

    // Document Q&A (RAG)
    public String intelligentQA(String question) {
        // 1. Retrieve relevant documents
        SearchResult searchResult = vectorStoreService.search(question, 3);
        // 2. Build the context
        String context = searchResult.getContents()
                .stream()
                .collect(Collectors.joining("\n"));
        // 3. Generate the answer
        List<ChatMessage> messages = List.of(
                ChatMessage.system("Answer the question based on the following context:\n" + context),
                ChatMessage.user(question)
        );
        return chatService.chat(messages).getContent();
    }
}
```
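Note that retrieval presupposes an ingestion step: documents must be embedded and written to the vector store before `intelligentQA` can find them. A hedged sketch; the `vectorStoreService.add(...)` signature shown here is hypothetical and not confirmed by this document:

```java
@Service
public class DocumentIngestionService {

    @Autowired
    private VectorStoreService vectorStoreService;

    // Embeds the document and writes it to the configured vector store.
    // add(id, content) is a hypothetical signature used for illustration;
    // consult the nebula-ai-core API for the actual method.
    public void ingest(String documentId, String content) {
        vectorStoreService.add(documentId, content);
    }
}
```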
Configuration example:
```yaml
nebula:
  ai:
    enabled: true
    # Chat configuration
    chat:
      default-provider: openai
      providers:
        openai:
          api-key: ${OPENAI_API_KEY}
          model: gpt-3.5-turbo
          options:
            temperature: 0.7
            max-tokens: 1000
    # Embedding configuration
    embedding:
      default-provider: openai
      providers:
        openai:
          api-key: ${OPENAI_API_KEY}
          model: text-embedding-ada-002
    # Vector store configuration
    vector-store:
      default-provider: chroma
      providers:
        chroma:
          host: localhost
          port: 8000
          collection-name: nebula-docs
```
### 8. Crawler Layer

Nebula Crawler is a complete set of web data collection modules. It supports two collection modes, HTTP and browser-based, and provides proxy pool management and CAPTCHA handling.
#### Module Architecture

```mermaid
graph TB
    subgraph "Crawler Modules"
        A[nebula-crawler-core<br/>Core abstraction] --> B[nebula-crawler-http<br/>HTTP crawler engine]
        A --> C[nebula-crawler-browser<br/>Browser crawler engine]
        A --> D[nebula-crawler-proxy<br/>Proxy pool management]
        A --> E[nebula-crawler-captcha<br/>CAPTCHA handling]
    end
    B --> F[OkHttp client]
    C --> G[Playwright]
    D --> H[Redis cache]
    E --> I[ddddocr / third-party APIs]
```
#### nebula-crawler-core (Core Abstraction)
**Unified crawler engine interface and request/response model**
```java
// Crawler engine interface
public interface CrawlerEngine {
    String getType();                                                      // engine type
    CrawlerResponse crawl(CrawlerRequest request);                         // synchronous crawl
    CompletableFuture<CrawlerResponse> crawlAsync(CrawlerRequest request); // asynchronous crawl
    List<CrawlerResponse> crawlBatch(List<CrawlerRequest> requests);       // batch crawl
    void shutdown();                                                       // shut down the engine
    boolean isHealthy();                                                   // health check
}

// Build a request
CrawlerRequest request = CrawlerRequest.get("https://example.com")
        .header("Accept", "text/html")
        .timeout(30000)
        .build();

// Browser-rendered request
CrawlerRequest jsRequest = CrawlerRequest.renderPage("https://spa-app.com")
        .waitSelector("#content")
        .waitTimeout(5000)
        .screenshot(true)
        .build();

// Handle the response (given some CrawlerEngine instance `engine`)
CrawlerResponse response = engine.crawl(request);
if (response.isSuccess()) {
    // Parse as a Jsoup Document
    Document doc = response.asDocument();
    String title = doc.select("title").text();
    // Or parse as JSON
    Map<String, Object> data = response.asMap();
}
```
#### nebula-crawler-http (HTTP Crawler Engine)
**High-performance HTTP crawler built on OkHttp**
```java
@Service
@Slf4j
public class DataCrawlerService {

    @Autowired
    private HttpCrawlerEngine httpEngine;

    public String fetchPage(String url) {
        CrawlerRequest request = CrawlerRequest.get(url)
                .header("User-Agent", "Mozilla/5.0...")
                .retryCount(3)
                .build();
        CrawlerResponse response = httpEngine.crawl(request);
        if (response.isSuccess()) {
            return response.getContent();
        } else {
            log.error("Crawl failed: {}, error: {}", url, response.getErrorMessage());
            return null;
        }
    }

    // Batch crawling
    public List<String> fetchPages(List<String> urls) {
        List<CrawlerRequest> requests = urls.stream()
                .map(u -> CrawlerRequest.get(u).build())
                .collect(Collectors.toList());
        return httpEngine.crawlBatch(requests).stream()
                .filter(CrawlerResponse::isSuccess)
                .map(CrawlerResponse::getContent)
                .collect(Collectors.toList());
    }
}
```
Features:
- Connection pool management (configurable max connections and keep-alive time)
- User-Agent rotation
- QPS rate limiting
- Automatic retries
- Proxy support
#### nebula-crawler-browser (Browser Crawler Engine)
**Browser automation engine built on Playwright**
```java
@Service
public class DynamicPageCrawler {

    @Autowired
    private BrowserCrawlerEngine browserEngine;

    public String crawlDynamicPage(String url) {
        CrawlerRequest request = CrawlerRequest.renderPage(url)
                .waitSelector(".content-loaded") // wait for this element to appear
                .waitTimeout(10000)              // wait timeout
                .screenshot(true)                // take a screenshot
                .build();
        CrawlerResponse response = browserEngine.crawl(request);
        if (response.isSuccess()) {
            // Get the rendered HTML
            String html = response.getContent();
            // Get the screenshot
            byte[] screenshot = response.getScreenshot();
            return html;
        }
        return null;
    }
}
```
Run modes:
- LOCAL: launches a browser instance locally; suited to development and debugging
- REMOTE: connects to a remote Playwright Server; supports Docker/K8s deployment

Remote mode configuration:
```yaml
nebula:
  crawler:
    browser:
      enabled: true
      mode: REMOTE
      remote:
        endpoints:
          - ws://playwright-server-01:9222
          - ws://playwright-server-02:9222
        load-balance-strategy: ROUND_ROBIN
```
#### nebula-crawler-proxy (Proxy Pool Management)
**Unified proxy IP management and rotation**
```java
@Service
public class ProxyCrawlerService {

    @Autowired
    private HttpCrawlerEngine engine;
    @Autowired
    private ProxyProvider proxyProvider;

    public String crawlWithProxy(String url) {
        // Acquire an available proxy
        Proxy proxy = proxyProvider.getProxy();
        CrawlerRequest request = CrawlerRequest.get(url)
                .proxy(proxy)
                .build();
        CrawlerResponse response = engine.crawl(request);
        // Report the proxy usage result
        if (response.isSuccess()) {
            proxyProvider.reportSuccess(proxy);
        } else {
            proxyProvider.reportFailure(proxy);
        }
        return response.getContent();
    }
}
```
Proxy source configuration:
```yaml
nebula:
  crawler:
    proxy:
      enabled: true
      min-available: 10
      check-url: https://www.baidu.com
      # Static proxies
      static-proxies:
        - http://proxy1:8080
        - socks5://proxy2:1080
      # API proxy sources
      api-sources:
        - name: provider1
          url: http://api.proxy-provider.com/get
          format: json
```
Features:
- Multiple proxy sources (static configuration, API retrieval)
- Automatic health checks and eviction of dead proxies
- Smart rotation strategies
- Redis-backed persistence
#### nebula-crawler-captcha (CAPTCHA Handling)
**Recognition and handling for multiple CAPTCHA types**
```java
@Service
public class CaptchaCrawler {

    @Autowired
    private CaptchaManager captchaManager;

    // Image CAPTCHA recognition
    public String solveImageCaptcha(byte[] imageData) {
        return captchaManager.solveImage(imageData);
    }

    // Slider CAPTCHA: returns the sliding distance
    public String solveSliderCaptcha(byte[] backgroundImage, byte[] sliderImage) {
        return captchaManager.solveSlider(backgroundImage, sliderImage);
    }

    // Click CAPTCHA: returns the click coordinates
    public String solveClickCaptcha(byte[] imageData, String targetText) {
        return captchaManager.solveClick(imageData, targetText);
    }
}
```
Supported CAPTCHA types:
- Image CAPTCHAs (local ddddocr recognition)
- Slider CAPTCHAs (sliding distance computed with OpenCV)
- Click CAPTCHAs
- Rotation CAPTCHAs
- Third-party platforms (2Captcha, Anti-Captcha)
Configuration example:
```yaml
nebula:
  crawler:
    captcha:
      enabled: true
      local-ocr-enabled: true
      ddddocr-url: http://ddddocr-service:8866
      opencv-url: http://opencv-service:8867
      providers:
        - name: 2captcha
          api-key: ${CAPTCHA_API_KEY}
          enabled: true
          priority: 1
```
#### Maven Dependencies
```xml
<!-- Full crawler functionality -->
<dependency>
    <groupId>io.nebula</groupId>
    <artifactId>nebula-crawler-core</artifactId>
</dependency>
<dependency>
    <groupId>io.nebula</groupId>
    <artifactId>nebula-crawler-http</artifactId>
</dependency>
<dependency>
    <groupId>io.nebula</groupId>
    <artifactId>nebula-crawler-browser</artifactId>
</dependency>
<dependency>
    <groupId>io.nebula</groupId>
    <artifactId>nebula-crawler-proxy</artifactId>
</dependency>
<dependency>
    <groupId>io.nebula</groupId>
    <artifactId>nebula-crawler-captcha</artifactId>
</dependency>
```
#### Complete Usage Example
```java
@Service
@Slf4j
public class SupplierDataCrawler {

    @Autowired
    private HttpCrawlerEngine httpEngine;
    @Autowired
    private BrowserCrawlerEngine browserEngine;
    @Autowired
    private ProxyProvider proxyProvider;
    @Autowired
    private CaptchaManager captchaManager;

    /**
     * Collect supplier data.
     */
    public SupplierData crawlSupplier(String companyName) {
        // 1. Static page collection (HTTP engine)
        CrawlerRequest listRequest = CrawlerRequest.get("https://example.com/search")
                .param("keyword", companyName)
                .build();
        CrawlerResponse listResponse = httpEngine.crawl(listRequest);
        String detailUrl = parseDetailUrl(listResponse);

        // 2. Dynamic page collection (browser engine)
        CrawlerRequest detailRequest = CrawlerRequest.renderPage(detailUrl)
                .waitSelector(".company-info")
                .proxy(proxyProvider.getProxy())
                .build();
        CrawlerResponse detailResponse = browserEngine.crawl(detailRequest);

        // 3. Parse the data
        Document doc = detailResponse.asDocument();
        return parseSupplierData(doc);
    }
}
```
}